Example #1
0
def bq_hpo_update_by_id(hpo_id):
  gen = BQHPOGenerator()
  # get from main database in case the backup is not synch in time
  bqr = gen.make_bqrecord(hpo_id, backup=False)
  w_dao = BigQuerySyncDao()
  with w_dao.session() as w_session:
    gen.save_bqrecord(hpo_id, bqr, bqtable=BQHPO, w_dao=w_dao, w_session=w_session)
def bq_site_update_by_id(site_id):
    gen = BQSiteGenerator()
    bqr = gen.make_bqrecord(site_id, backup=False)
    w_dao = BigQuerySyncDao()
    with w_dao.session() as w_session:
        gen.save_bqrecord(site_id,
                          bqr,
                          bqtable=BQSite,
                          w_dao=w_dao,
                          w_session=w_session)
Example #3
0
def bq_organization_update_by_id(org_id):
    gen = BQOrganizationGenerator()
    bqr = gen.make_bqrecord(org_id, backup=False)
    w_dao = BigQuerySyncDao()
    with w_dao.session() as w_session:
        gen.save_bqrecord(org_id,
                          bqr,
                          bqtable=BQOrganization,
                          w_dao=w_dao,
                          w_session=w_session)
Example #4
0
 def make_bqrecord(self, hpo_id, convert_to_enum=False, backup=True):
   """
   Build a BQRecord object from the given hpo id.
   :param hpo_id: Primary key value from hpo table.
   :param convert_to_enum: If schema field description includes Enum class info, convert value to Enum.
   :param backup: if True, get from backup database
   :return: BQRecord object
   """
   ro_dao = BigQuerySyncDao(backup=backup)
   with ro_dao.session() as ro_session:
     row = ro_session.execute(text('select * from hpo where hpo_id = :id'), {'id': hpo_id}).first()
     data = ro_dao.to_dict(row)
     return BQRecord(schema=BQHPOSchema, data=data, convert_to_enum=convert_to_enum)
Example #5
0
def rebuild_bigquery_handler():
    """
  Cron job handler, setup queued tasks to rebuild bigquery data
  # TODO: Future: Currently rebuild is synchronous, this could be asynchronous if we
  #       passed a set of participant ids to each task.  GET requests are limited to 2,083
  #       characters, so we probably would have to create a temp MySQL table to store
  #       batches of participant ids and then we could pass a batch id in the GET request.
  """
    timestamp = datetime.utcnow()
    batch_size = 300

    ro_dao = BigQuerySyncDao(backup=True)
    with ro_dao.session() as ro_session:
        total_rows = ro_session.query(func.count(
            Participant.participantId)).first()[0]
        count = int(math.ceil(float(total_rows) / float(batch_size)))
        logging.info(
            'Calculated {0} tasks from {1} records and a batch size of {2}.'.
            format(count, total_rows, batch_size))

        while count > 0:
            task = taskqueue.add(queue_name='bigquery-rebuild',
                                 url='/rdr/v1/BQRebuildTaskApi',
                                 method='GET',
                                 target='worker',
                                 params={
                                     'timestamp': timestamp,
                                     'limit': batch_size
                                 })

            logging.info('Task {} enqueued, ETA {}.'.format(
                task.name, task.eta))
            count -= 1
    #
    # Process tables that don't need to be broken up into smaller tasks.
    #
    # Code Table
    deferrered_bq_codebook_update()
    # HPO Table
    bq_hpo_update()
    # Organization Table
    bq_organization_update()
    # Site Table
    bq_site_update()
Example #6
0
    def make_bqrecord(self, p_id, convert_to_enum=False):
        """
    Build a Participant Summary BQRecord object for the given participant id.
    :param p_id: participant id
    :param convert_to_enum: If schema field description includes Enum class info, convert value to Enum.
    :return: BQRecord object
    """
        if not self.ro_dao:
            self.ro_dao = BigQuerySyncDao(backup=True)

        with self.ro_dao.session() as session:
            # prep participant info from Participant record
            summary = self._prep_participant(p_id, session)
            # prep ConsentPII questionnaire information
            summary = self._merge_schema_dicts(
                summary, self._prep_consentpii_answers(p_id, session))
            # prep questionnaire modules information, includes gathering extra consents.
            summary = self._merge_schema_dicts(
                summary, self._prep_modules(p_id, session))
            # prep physical measurements
            summary = self._merge_schema_dicts(
                summary, self._prep_physical_measurements(p_id, session))
            # prep race and gender
            summary = self._merge_schema_dicts(
                summary, self._prep_the_basics(p_id, session))
            # prep biobank orders and samples
            summary = self._merge_schema_dicts(
                summary, self._prep_biobank_info(p_id, session))
            # calculate enrollment status for participant
            summary = self._merge_schema_dicts(
                summary, self._calculate_enrollment_status(summary))
            # calculate distinct visits
            summary = self._merge_schema_dicts(
                summary, self._calculate_distinct_visits(summary))

            return BQRecord(schema=BQParticipantSummarySchema,
                            data=summary,
                            convert_to_enum=convert_to_enum)
Example #7
0
def rebuild_bq_participant(p_id, ps_bqgen=None, pdr_bqgen=None):
    """
  Rebuild a BQ record for a specific participant
  :param p_id: participant id
  :param ps_bqgen: BQParticipantSummaryGenerator object
  :param pdr_bqgen: BQPDRParticipantSummaryGenerator object
  :return:
  """
    # Allow for batch requests to rebuild participant summary data.
    if not ps_bqgen:
        ps_bqgen = BQParticipantSummaryGenerator()
    if not pdr_bqgen:
        from dao.bq_pdr_participant_summary_dao import BQPDRParticipantSummaryGenerator
        pdr_bqgen = BQPDRParticipantSummaryGenerator()

    ps_bqr = ps_bqgen.make_bqrecord(p_id)
    # Since the PDR participant summary is primarily a subset of the Participant Summary, call the full
    # Participant Summary generator and take what we need from it.
    pdr_bqr = pdr_bqgen.make_bqrecord(p_id, ps_bqr=ps_bqr)

    w_dao = BigQuerySyncDao()
    with w_dao.session() as w_session:
        # save the participant summary record.
        ps_bqgen.save_bqrecord(p_id,
                               ps_bqr,
                               bqtable=BQParticipantSummary,
                               w_dao=w_dao,
                               w_session=w_session)
        # save the PDR participant summary record
        pdr_bqgen.save_bqrecord(p_id,
                                pdr_bqr,
                                bqtable=BQPDRParticipantSummary,
                                w_dao=w_dao,
                                w_session=w_session)

    return ps_bqr
Example #8
0
def bq_hpo_update(project_id=None):
  """
  Generate all new HPO records for BQ. Since there is called from a tool, this is not deferred.
  :param project_id: Override the project_id
  """
  ro_dao = BigQuerySyncDao(backup=True)
  with ro_dao.session() as ro_session:
    gen = BQHPOGenerator()
    results = ro_session.query(HPO.hpoId).all()

  w_dao = BigQuerySyncDao()
  with w_dao.session() as w_session:
    logging.info('BQ HPO table: rebuilding {0} records...'.format(len(results)))
    for row in results:
      bqr = gen.make_bqrecord(row.hpoId)
      gen.save_bqrecord(row.hpoId, bqr, bqtable=BQHPO, w_dao=w_dao, w_session=w_session, project_id=project_id)
def deferrered_bq_codebook_update():
    """
  Generate all new Codebook records for BQ.
  """
    ro_dao = BigQuerySyncDao(backup=True)
    with ro_dao.session() as ro_session:
        gen = BQCodeGenerator()
        results = ro_session.query(Code.codeId).all()

    w_dao = BigQuerySyncDao()
    with w_dao.session() as w_session:
        logging.info('Code table: rebuilding {0} records...'.format(
            len(results)))
        for row in results:
            bqr = gen.make_bqrecord(row.codeId)
            gen.save_bqrecord(row.codeId,
                              bqr,
                              bqtable=BQCode,
                              w_dao=w_dao,
                              w_session=w_session)
def deferred_bq_questionnaire_update(p_id, qr_id):
    """
  Generate a BQ questionnaire response record from the given p_id and questionnaire response id.
  :param p_id: participant id
  :param qr_id: A questionnaire response id.
  """
    # determine what the module id is for the given questionnaire response id.
    sql = text("""
    select c.value from
        questionnaire_response qr inner join questionnaire_concept qc on qr.questionnaire_id = qc.questionnaire_id
        inner join code c on qc.code_id = c.code_id
    where qr.questionnaire_response_id = :qr_id
  """)

    ro_dao = BigQuerySyncDao(backup=True)
    w_dao = BigQuerySyncDao()
    qr_gen = BQPDRQuestionnaireResponseGenerator()
    module_id = None

    with ro_dao.session() as ro_session:

        results = ro_session.execute(sql, {'qr_id': qr_id})
        if results:
            for row in results:
                module_id = row.value
                break

            if not module_id:
                logging.warning(
                    'No questionnaire module id found for questionnaire response id {0}'
                    .format(qr_id))
                return

            table, bqrs = qr_gen.make_bqrecord(p_id, module_id, latest=True)
            with w_dao.session() as w_session:
                for bqr in bqrs:
                    qr_gen.save_bqrecord(qr_id,
                                         bqr,
                                         bqtable=table,
                                         w_dao=w_dao,
                                         w_session=w_session)
    def get_fields(self):
        """
    Look up a participant id who has submitted this module and then get the module response answers to use
    for creating the schema.
    :return: list of fields
    """
        fields = list()
        self._fields = list()
        # Standard fields that must be in every BigQuery table.
        fields.append({
            'name': 'id',
            'type': BQFieldTypeEnum.INTEGER.name,
            'mode': BQFieldModeEnum.REQUIRED.name
        })
        fields.append({
            'name': 'created',
            'type': BQFieldTypeEnum.DATETIME.name,
            'mode': BQFieldModeEnum.REQUIRED.name
        })
        fields.append({
            'name': 'modified',
            'type': BQFieldTypeEnum.DATETIME.name,
            'mode': BQFieldModeEnum.REQUIRED.name
        })
        fields.append({
            'name': 'authored',
            'type': BQFieldTypeEnum.DATETIME.name,
            'mode': BQFieldModeEnum.NULLABLE.name
        })
        fields.append({
            'name': 'language',
            'type': BQFieldTypeEnum.STRING.name,
            'mode': BQFieldModeEnum.NULLABLE.name
        })
        fields.append({
            'name': 'participant_id',
            'type': BQFieldTypeEnum.INTEGER.name,
            'mode': BQFieldModeEnum.REQUIRED.name
        })
        fields.append({
            'name': 'questionnaire_response_id',
            'type': BQFieldTypeEnum.INTEGER.name,
            'mode': BQFieldModeEnum.REQUIRED.name
        })

        dao = BigQuerySyncDao()

        _sql_term = text("""
        select convert(qh.resource using utf8) as resource 
          from questionnaire_concept qc inner join code c on qc.code_id = c.code_id
               inner join questionnaire_history qh on qc.questionnaire_id = qh.questionnaire_id and 
                          qc.questionnaire_version = qh.version
        where c.value = :mod
        order by qh.created desc limit 1;
    """)

        with dao.session() as session:

            # get a participant id that has submitted the module
            result = session.execute(_sql_term, {'mod': self._module}).first()
            if not result:
                return fields

            qn_mod = json.loads(result[0])
            if 'resourceType' not in qn_mod or 'group' not in qn_mod:
                return fields

            for qn in qn_mod['group']['question']:
                # To support 1) The user supplied answer, 2) question skipped or 3) user was not given this question. We
                # have to store all question responses as Strings in BigQuery.
                field = qn['concept'][0].get('code', None)
                if not field:
                    continue

                name = qn['concept'][0]['code']
                if name in self._excluded_fields:
                    continue

                field = dict()
                field['name'] = name
                field['type'] = BQFieldTypeEnum.STRING.name
                field['mode'] = BQFieldModeEnum.NULLABLE.name
                field['enum'] = None
                fields.append(field)

            return fields
Example #12
0
 def __init__(self):
     super(BQRebuildTaskApi, self).__init__(BigQuerySyncDao())
    def make_bqrecord(self,
                      p_id,
                      module_id,
                      latest=False,
                      convert_to_enum=False):
        """
    Generate a list of questionnaire module BQRecords for the given participant id.
    :param p_id: participant id
    :param module_id: A questionnaire module id, IE: 'TheBasics'.
    :param latest: only process the most recent response if True
    :param convert_to_enum: If schema field description includes Enum class info, convert value to Enum.
    :return: BQTable object, List of BQRecord objects
    """
        if not self.ro_dao:
            self.ro_dao = BigQuerySyncDao(backup=True)

        if module_id == 'TheBasics':
            table = BQPDRTheBasics
        elif module_id == 'ConsentPII':
            table = BQPDRConsentPII
        elif module_id == 'Lifestyle':
            table = BQPDRLifestyle
        elif module_id == 'OverallHealth':
            table = BQPDROverallHealth
        elif module_id == 'DVEHRSharing':
            table = BQPDRDVEHRSharing
        elif module_id == 'EHRConsentPII':
            table = BQPDREHRConsentPII
        elif module_id == 'FamilyHistory':
            table = BQPDRFamilyHistory
        elif module_id == 'HealthcareAccess':
            table = BQPDRHealthcareAccess
        elif module_id == 'PersonalMedicalHistory':
            table = BQPDRPersonalMedicalHistory
        else:
            logging.info(
                'Generator: ignoring questionnaire module id {0}.'.format(
                    module_id))
            return None, list()

        qnans = self.ro_dao.call_proc('sp_get_questionnaire_answers',
                                      args=[module_id, p_id])
        if not qnans or len(qnans) == 0:
            return None, list()

        bqrs = list()
        for qnan in qnans:
            bqr = BQRecord(schema=table().get_schema(),
                           data=qnan,
                           convert_to_enum=convert_to_enum)
            bqr.participant_id = p_id  # reset participant_id.

            fields = bqr.get_fields()
            for field in fields:
                fld_name = field['name']
                if fld_name in ('id', 'created', 'modified', 'authored',
                                'language', 'participant_id',
                                'questionnaire_response_id'):
                    continue

                fld_value = getattr(bqr, fld_name, None)
                if fld_value is None:  # Let empty strings pass.
                    continue
                # question responses values need to be coerced to a String type.
                if isinstance(fld_value, (datetime.date, datetime.datetime)):
                    setattr(bqr, fld_name, fld_value.isoformat())
                else:
                    try:
                        setattr(bqr, fld_name, str(fld_value))
                    except UnicodeEncodeError:
                        setattr(bqr, fld_name, unicode(fld_value))

                # Truncate zip codes to 3 digits
                if fld_name in ('StreetAddress_PIIZIP',
                                'EmploymentWorkAddress_ZipCode'
                                ) and len(fld_value) > 2:
                    setattr(bqr, fld_name, fld_value[:3])

            bqrs.append(bqr)
            if latest:
                break

        return table, bqrs
Example #14
0
def rebuild_bq_participant_task(timestamp, limit=0):
    """
  Loop through all participants and generate the BQ participant summary data and
  store it in the biguqery_sync table.
  Warning: this will force a rebuild and eventually a re-sync for every participant record.
  :param timestamp: datetime: to be used to rebuild any records old than this.
  :param limit: integer: 0 = all, otherwise only process records until limit has been reached.
  """
    if not limit or not isinstance(limit, int) or limit < 0:
        raise ValueError('invalid limit value.')

    if not timestamp:
        timestamp = datetime.utcnow()

    # try:
    #   app_id = app_identity.get_application_id()
    # except AttributeError:
    #   app_id = 'localhost'
    ro_dao = BigQuerySyncDao(backup=True)
    ps_bqgen = BQParticipantSummaryGenerator()
    pdr_bqgen = BQPDRParticipantSummaryGenerator()
    mod_bqgen = BQPDRQuestionnaireResponseGenerator()

    with ro_dao.session() as ro_session:
        count = 0
        while limit:
            limit -= 1
            # Collect all participants who do not have a PS generated yet or the modified date is less than the timestamp.
            sq = ro_session.query(Participant.participantId, BigQuerySync.id, BigQuerySync.modified).\
                    outerjoin(BigQuerySync, and_(
                      BigQuerySync.pk_id == Participant.participantId,
                      BigQuerySync.tableId.in_(('participant_summary', 'pdr_participant')))).subquery()
            query = ro_session.query(sq.c.participant_id.label('participantId')).\
                        filter(or_(sq.c.id == None, sq.c.modified < timestamp)).\
                        order_by(sq.c.modified)
            query = query.limit(1)

            # sql = dao.query_to_text(query)
            results = query.all()

            for row in results:
                count += 1
                # All logic for generating a participant summary is here.
                rebuild_bq_participant(row.participantId,
                                       ps_bqgen=ps_bqgen,
                                       pdr_bqgen=pdr_bqgen)

                # Generate participant questionnaire module response data
                modules = (BQPDRConsentPII, BQPDRTheBasics, BQPDRLifestyle,
                           BQPDROverallHealth, BQPDREHRConsentPII,
                           BQPDRDVEHRSharing)
                for module in modules:
                    mod = module()
                    table, mod_bqrs = mod_bqgen.make_bqrecord(
                        row.participantId,
                        mod.get_schema().get_module_name())
                    if not table:
                        continue

                    w_dao = BigQuerySyncDao()
                    with w_dao.session() as w_session:
                        for mod_bqr in mod_bqrs:
                            mod_bqgen.save_bqrecord(
                                mod_bqr.questionnaire_response_id,
                                mod_bqr,
                                bqtable=table,
                                w_dao=w_dao,
                                w_session=w_session)

        logging.info(
            'Rebuilt BigQuery data for {0} participants.'.format(count))
Example #15
0
def sync_bigquery_handler(dryrun=False):
    """
  Cron entry point, Sync MySQL records to bigquery.
  :param dryrun: Don't send to bigquery if True
  Links for Streaming Inserts:
  # https://cloud.google.com/bigquery/streaming-data-into-bigquery
  # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency
  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
  # https://cloud.google.com/bigquery/troubleshooting-errors#streaming
  """
    ro_dao = BigQuerySyncDao(backup=True)
    bq = build('bigquery', 'v2') if dryrun is False else None
    total_inserts = 0
    # Google says maximum of 500 in a batch. Pretty sure they are talking about log shipping, I believe
    # that one participant summary record is larger than their expected average log record size.
    batch_size = 250
    run_limit = (
        2 * 60
    ) - 10  # Only run for 110 seconds before exiting, so we don't have overlapping cron jobs.
    limit_reached = False
    start_ts = datetime.now()
    table_list = list()

    with ro_dao.session() as ro_session:
        tables = ro_session.query(BigQuerySync.projectId, BigQuerySync.datasetId, BigQuerySync.tableId).\
                            distinct(BigQuerySync.projectId, BigQuerySync.datasetId, BigQuerySync.tableId).all()
        # don't always process the list in the same order so we don't get stuck processing the same table each run.
        for table_row in tables:
            table_list.append(
                (table_row.projectId, table_row.datasetId, table_row.tableId))
        random.shuffle(table_list)

        for item in table_list:
            project_id = item[0]
            dataset_id = item[1]
            table_id = item[2]
            count = 0
            errors = ''
            error_count = 0
            try:
                # pylint: disable=unused-variable
                max_created, max_modified = _get_remote_max_timestamps(project_id, dataset_id, table_id) \
                                                  if dryrun is False else (datetime.min, datetime.min)
            except httplib.HTTPException:
                logging.warning(
                    'Failed to retrieve max date values from bigquery, skipping this run.'
                )
                return 0

            # figure out how many records need to be sync'd and divide into slices.
            total_rows = ro_session.query(BigQuerySync.id). \
                        filter(BigQuerySync.projectId == project_id, BigQuerySync.tableId == table_id,
                               BigQuerySync.datasetId == dataset_id, BigQuerySync.modified >= max_modified).count()

            if total_rows == 0:
                logging.info('No rows to sync for {0}.{1}.'.format(
                    dataset_id, table_id))
                continue
            slices = int(math.ceil(float(total_rows) / float(batch_size)))
            slice_num = 0

            while slice_num < slices:
                results = ro_session.query(BigQuerySync.id, BigQuerySync.created, BigQuerySync.modified). \
                      filter(BigQuerySync.projectId == project_id, BigQuerySync.tableId == table_id,
                             BigQuerySync.datasetId == dataset_id, BigQuerySync.modified >= max_modified).\
                      order_by(BigQuerySync.modified).limit(batch_size).all()
                slice_num += 1
                batch = list()

                for row in results:
                    count += 1
                    max_modified = row.modified
                    rec = ro_session.query(BigQuerySync.resource).filter(
                        BigQuerySync.id == row.id).first()
                    if isinstance(rec.resource, (str, unicode)):
                        rec_data = json.loads(rec.resource)
                    else:
                        rec_data = rec.resource
                    rec_data['id'] = row.id
                    rec_data['created'] = row.created.isoformat()
                    rec_data['modified'] = row.modified.isoformat()
                    data = {'insertId': str(row.id), 'json': rec_data}
                    batch.append(data)

                if len(batch) > 0:
                    result, resp = insert_batch_into_bq(
                        bq, project_id, dataset_id, table_id, batch, dryrun)
                    if result is False:
                        # errors are cumulative, so wait until the end of the while
                        # statement before printing errors.
                        errors = resp
                        error_count += len(resp['insertErrors'])
                # Don't exceed our execution time limit.
                if (datetime.now() - start_ts).seconds > run_limit:
                    logging.info(
                        'Hit {0} second time limit.'.format(run_limit))
                    limit_reached = True
                    break

            if errors:
                logging.error(errors)

            total_inserts += (count - error_count)
            msg = '{0} inserts and {1} errors for {2}.{3}.{4}'.format(
                count, error_count, project_id, dataset_id, table_id)
            if error_count == 0:
                logging.info(msg)
            else:
                logging.info(msg)

            if limit_reached:
                break

    return total_inserts
Example #16
0
class BQParticipantSummaryGenerator(BigQueryGenerator):
    """
  Generate a Participant Summary BQRecord object
  """
    ro_dao = None

    def make_bqrecord(self, p_id, convert_to_enum=False):
        """
    Build a Participant Summary BQRecord object for the given participant id.
    :param p_id: participant id
    :param convert_to_enum: If schema field description includes Enum class info, convert value to Enum.
    :return: BQRecord object
    """
        if not self.ro_dao:
            self.ro_dao = BigQuerySyncDao(backup=True)

        with self.ro_dao.session() as session:
            # prep participant info from Participant record
            summary = self._prep_participant(p_id, session)
            # prep ConsentPII questionnaire information
            summary = self._merge_schema_dicts(
                summary, self._prep_consentpii_answers(p_id, session))
            # prep questionnaire modules information, includes gathering extra consents.
            summary = self._merge_schema_dicts(
                summary, self._prep_modules(p_id, session))
            # prep physical measurements
            summary = self._merge_schema_dicts(
                summary, self._prep_physical_measurements(p_id, session))
            # prep race and gender
            summary = self._merge_schema_dicts(
                summary, self._prep_the_basics(p_id, session))
            # prep biobank orders and samples
            summary = self._merge_schema_dicts(
                summary, self._prep_biobank_info(p_id, session))
            # calculate enrollment status for participant
            summary = self._merge_schema_dicts(
                summary, self._calculate_enrollment_status(summary))
            # calculate distinct visits
            summary = self._merge_schema_dicts(
                summary, self._calculate_distinct_visits(summary))

            return BQRecord(schema=BQParticipantSummarySchema,
                            data=summary,
                            convert_to_enum=convert_to_enum)

    def _prep_participant(self, p_id, session):
        """
    Get the information from the participant record
    :param p_id: participant id
    :param session: DAO session object
    :return: dict
    """
        p = session.query(Participant).filter(
            Participant.participantId == p_id).first()
        if not p:
            raise LookupError(
                'participant lookup for P{0} failed.'.format(p_id))

        hpo = session.query(HPO.name).filter(HPO.hpoId == p.hpoId).first()
        organization = session.query(Organization.externalId). \
                              filter(Organization.organizationId == p.organizationId).first()

        withdrawal_status = WithdrawalStatus(p.withdrawalStatus)
        withdrawal_reason = WithdrawalReason(
            p.withdrawalReason if p.withdrawalReason else 0)
        suspension_status = SuspensionStatus(p.suspensionStatus)

        data = {
            'participant_id': p_id,
            'biobank_id': p.biobankId,
            'last_modified': p.lastModified,
            'sign_up_time': p.signUpTime,
            'hpo': hpo.name if hpo else None,
            'hpo_id': p.hpoId,
            'organization': organization.externalId if organization else None,
            'organization_id': p.organizationId,
            'withdrawal_status': str(withdrawal_status),
            'withdrawal_status_id': int(withdrawal_status),
            'withdrawal_reason': str(withdrawal_reason),
            'withdrawal_reason_id': int(withdrawal_reason),
            'withdrawal_time': p.withdrawalTime,
            'withdrawal_authored': p.withdrawalAuthored,
            'withdrawal_reason_justification': p.withdrawalReasonJustification,
            'suspension_status': str(suspension_status),
            'suspension_status_id': int(suspension_status),
            'suspension_time': p.suspensionTime,
            'site': self._lookup_site_name(p.siteId, session),
            'site_id': p.siteId,
            'is_ghost_id': 1 if p.isGhostId is True else 0
        }

        return data

    def _prep_consentpii_answers(self, p_id, ro_session):
        """
    Get participant information from the ConsentPII questionnaire
    :param p_id: participant id
    :param ro_session: Readonly DAO session object
    :return: dict
    """
        qnans = self.ro_dao.call_proc('sp_get_questionnaire_answers',
                                      args=['ConsentPII', p_id])
        if not qnans or len(qnans) == 0:
            # return the minimum data required when we don't have the questionnaire data.
            return {'email': None, 'is_ghost_id': 0}
        qnan = BQRecord(schema=None,
                        data=qnans[0])  # use only most recent response.

        # TODO: We may need to use the first response to set consent dates,
        #  unless the consent value changed across response records.

        data = {
            'first_name':
            qnan.PIIName_First,
            'middle_name':
            qnan.PIIName_Middle,
            'last_name':
            qnan.PIIName_Last,
            'date_of_birth':
            qnan.PIIBirthInformation_BirthDate,
            'primary_language':
            qnan.language,
            'email':
            qnan.ConsentPII_EmailAddress,
            'phone_number':
            qnan.PIIContactInformation_Phone,
            'login_phone_number':
            qnan.ConsentPII_VerifiedPrimaryPhoneNumber,
            'addresses': [{
                'addr_type':
                BQStreetAddressTypeEnum.RESIDENCE.name,
                'addr_type_id':
                BQStreetAddressTypeEnum.RESIDENCE.value,
                'addr_street_address_1':
                qnan.PIIAddress_StreetAddress,
                'addr_street_address_2':
                qnan.PIIAddress_StreetAddress2,
                'addr_city':
                qnan.StreetAddress_PIICity,
                'addr_state':
                qnan.StreetAddress_PIIState.replace('PIIState_', '').upper()
                if qnan.StreetAddress_PIIState else None,
                'addr_zip':
                qnan.StreetAddress_PIIZIP,
                'addr_country':
                'US'
            }],
            'consents': [
                {
                    'consent':
                    'ConsentPII',
                    'consent_id':
                    self._lookup_code_id('ConsentPII', ro_session),
                    'consent_date':
                    parser.parse(qnan.authored).date()
                    if qnan.authored else None,
                    'consent_value':
                    'ConsentPermission_Yes',
                    'consent_value_id':
                    self._lookup_code_id('ConsentPermission_Yes', ro_session),
                },
            ]
        }

        return data

    def _prep_modules(self, p_id, ro_session):
        """
    Find all questionnaire modules the participant has completed and loop through them.
    :param p_id: participant id
    :param ro_session: Readonly DAO session object
    :return: dict
    """
        code_id_query = ro_session.query(func.max(QuestionnaireConcept.codeId)).\
                            filter(QuestionnaireResponse.questionnaireId ==
                                    QuestionnaireConcept.questionnaireId).label('codeId')
        query = ro_session.query(
                      QuestionnaireResponse.questionnaireResponseId, QuestionnaireResponse.authored,
                      QuestionnaireResponse.created, QuestionnaireResponse.language, code_id_query).\
                    filter(QuestionnaireResponse.participantId == p_id).\
                    order_by(QuestionnaireResponse.questionnaireResponseId)
        # sql = self.dao.query_to_text(query)
        results = query.all()

        data = dict()
        modules = list()
        consents = list()
        baseline_modules = ['TheBasics', 'OverallHealth', 'Lifestyle']
        try:
            baseline_modules = config.getSettingList(
                'baseline_ppi_questionnaire_fields')
        except ValueError:
            pass
        except AssertionError:  # unittest errors because of GCP SDK
            pass

        consent_modules = {
            # module: question code string
            'DVEHRSharing': 'DVEHRSharing_AreYouInterested',
            'EHRConsentPII': 'EHRConsentPII_ConsentPermission',
        }

        if results:
            for row in results:
                module_name = self._lookup_code_value(row.codeId, ro_session)
                modules.append({
                    'mod_module':
                    module_name,
                    'mod_baseline_module':
                    1
                    if module_name in baseline_modules else 0,  # Boolean field
                    'mod_authored':
                    row.authored,
                    'mod_created':
                    row.created,
                    'mod_language':
                    row.language,
                    'mod_status':
                    BQModuleStatusEnum.SUBMITTED.name,
                    'mod_status_id':
                    BQModuleStatusEnum.SUBMITTED.value,
                })

                # check if this is a module with consents.
                if module_name not in consent_modules:
                    continue
                qnans = self.ro_dao.call_proc('sp_get_questionnaire_answers',
                                              args=[module_name, p_id])
                if qnans and len(qnans) > 0:
                    qnan = BQRecord(
                        schema=None,
                        data=qnans[0])  # use only most recent questionnaire.
                    consents.append({
                        'consent':
                        consent_modules[module_name],
                        'consent_id':
                        self._lookup_code_id(consent_modules[module_name],
                                             ro_session),
                        'consent_date':
                        parser.parse(qnan.authored).date()
                        if qnan.authored else None,
                        'consent_value':
                        qnan[consent_modules[module_name]],
                        'consent_value_id':
                        self._lookup_code_id(
                            qnan[consent_modules[module_name]], ro_session),
                    })

        if len(modules) > 0:
            data['modules'] = modules
            if len(consents) > 0:
                data['consents'] = consents

        return data

    def _prep_the_basics(self, p_id, ro_session):
        """
    Get the participant's race and gender selections
    :param p_id: participant id
    :param ro_session: Readonly DAO session object
    :return: dict
    """
        qnans = self.ro_dao.call_proc('sp_get_questionnaire_answers',
                                      args=['TheBasics', p_id])
        if not qnans or len(qnans) == 0:
            return {}

        # get race question answers
        qnan = BQRecord(schema=None,
                        data=qnans[0])  # use only most recent questionnaire.
        data = {}
        if qnan.Race_WhatRaceEthnicity:
            rl = list()
            for val in qnan.Race_WhatRaceEthnicity.split(','):
                rl.append({
                    'race': val,
                    'race_id': self._lookup_code_id(val, ro_session)
                })
            data['races'] = rl
        # get gender question answers
        gl = list()
        if qnan.Gender_GenderIdentity:
            for val in qnan.Gender_GenderIdentity.split(','):
                if val == 'GenderIdentity_AdditionalOptions':
                    continue
                gl.append({
                    'gender': val,
                    'gender_id': self._lookup_code_id(val, ro_session)
                })
        # get additional gender answers, if any.
        if qnan.GenderIdentity_SexualityCloserDescription:
            for val in qnan.GenderIdentity_SexualityCloserDescription.split(
                    ','):
                gl.append({
                    'gender': val,
                    'gender_id': self._lookup_code_id(val, ro_session)
                })

        if len(gl) > 0:
            data['genders'] = gl

        data['education'] = qnan.EducationLevel_HighestGrade
        data['education_id'] = self._lookup_code_id(
            qnan.EducationLevel_HighestGrade, ro_session)
        data['income'] = qnan.Income_AnnualIncome
        data['income_id'] = self._lookup_code_id(qnan.Income_AnnualIncome,
                                                 ro_session)
        data['sex'] = qnan.BiologicalSexAtBirth_SexAtBirth
        data['sex_id'] = self._lookup_code_id(
            qnan.BiologicalSexAtBirth_SexAtBirth, ro_session)
        data['sexual_orientation'] = qnan.TheBasics_SexualOrientation
        data['sexual_orientation_id'] = self._lookup_code_id(
            qnan.TheBasics_SexualOrientation, ro_session)

        return data

    def _prep_physical_measurements(self, p_id, ro_session):
        """
    Get participant's physical measurements information
    :param p_id: participant id
    :param ro_session: Readonly DAO session object
    :return: dict
    """
        data = {}
        pm_list = list()

        query = ro_session.query(PhysicalMeasurements.created, PhysicalMeasurements.createdSiteId,
                                 PhysicalMeasurements.final, PhysicalMeasurements.finalized,
                                 PhysicalMeasurements.finalizedSiteId, PhysicalMeasurements.status,
                                 PhysicalMeasurements.cancelledTime).\
                filter(PhysicalMeasurements.participantId == p_id).\
                order_by(desc(PhysicalMeasurements.created))
        # sql = self.dao.query_to_text(query)
        results = query.all()

        for row in results:
            # row.status is not really used, we can only determine status
            if not row.cancelledTime:
                status = PhysicalMeasurementsStatus.COMPLETED
            else:
                status = PhysicalMeasurementsStatus.CANCELLED

            pm_list.append({
                'pm_status':
                str(status),
                'pm_status_id':
                int(status),
                'pm_created':
                row.created,
                'pm_created_site':
                self._lookup_site_name(row.createdSiteId, ro_session),
                'pm_created_site_id':
                row.createdSiteId,
                'pm_finalized':
                row.finalized,
                'pm_finalized_site':
                self._lookup_site_name(row.finalizedSiteId, ro_session),
                'pm_finalized_site_id':
                row.finalizedSiteId,
            })

        if len(pm_list) > 0:
            data['pm'] = pm_list
        return data

    def _prep_biobank_info(self, p_id, ro_session):
        """
    Look up biobank orders
    :param p_id: participant id
    :param ro_session: Readonly DAO session object
    :return:
    """
        data = {}
        orders = list()
        baseline_tests = [
            "1ED04", "1ED10", "1HEP4", "1PST8", "2PST8", "1SST8", "2SST8",
            "1PS08", "1SS08", "1UR10", "1CFD9", "1PXR2", "1UR90", "2ED10"
        ]
        try:
            baseline_tests = config.getSettingList(
                'baseline_sample_test_codes')
        except ValueError:
            pass
        except AssertionError:  # unittest errors because of GCP SDK
            pass

        dna_tests = ["1ED10", "2ED10", "1ED04", "1SAL", "1SAL2"]
        try:
            dna_tests = config.getSettingList('dna_sample_test_codes')
        except ValueError:
            pass
        except AssertionError:  # unittest errors because of GCP SDK
            pass

        sql = """
      select bo.biobank_order_id, bo.created, bo.collected_site_id, bo.processed_site_id, bo.finalized_site_id, 
              bos.test, bos.collected, bos.processed, bos.finalized, bo.order_status,
              bss.confirmed as bb_confirmed, bss.created as bb_created, bss.disposed as bb_disposed, 
              bss.status as bb_status, (
                select count(1) from biobank_dv_order bdo where bdo.biobank_order_id = bo.biobank_order_id
              ) as dv_order
        from biobank_order bo inner join biobank_ordered_sample bos on bo.biobank_order_id = bos.order_id
                inner join biobank_order_identifier boi on bo.biobank_order_id = boi.biobank_order_id
                left outer join 
                  biobank_stored_sample bss on boi.`value` = bss.biobank_order_identifier and bos.test = bss.test
        where boi.`system` = 'https://www.pmi-ops.org' and bo.participant_id = :pid
        order by bo.biobank_order_id, bos.test;
    """

        cursor = ro_session.execute(sql, {'pid': p_id})
        results = [r for r in cursor]
        # loop through results and create one order record for each biobank_order_id value.
        for row in results:
            if not filter(
                    lambda order: order['bbo_biobank_order_id'] == row.
                    biobank_order_id, orders):
                orders.append({
                    'bbo_biobank_order_id':
                    row.biobank_order_id,
                    'bbo_created':
                    row.created,
                    'bbo_status':
                    str(
                        BiobankOrderStatus(row.order_status) if row.
                        order_status else BiobankOrderStatus.UNSET),
                    'bbo_status_id':
                    int(
                        BiobankOrderStatus(row.order_status) if row.
                        order_status else BiobankOrderStatus.UNSET),
                    'bbo_dv_order':
                    0 if row.dv_order == 0 else 1,  # Boolean field
                    'bbo_collected_site':
                    self._lookup_site_name(row.collected_site_id, ro_session),
                    'bbo_collected_site_id':
                    row.collected_site_id,
                    'bbo_processed_site':
                    self._lookup_site_name(row.processed_site_id, ro_session),
                    'bbo_processed_site_id':
                    row.processed_site_id,
                    'bbo_finalized_site':
                    self._lookup_site_name(row.finalized_site_id, ro_session),
                    'bbo_finalized_site_id':
                    row.finalized_site_id,
                })
        # loop through results again and add each sample to it's order.
        for row in results:
            # get the order list index for this sample record
            try:
                idx = orders.index(
                    filter(
                        lambda order: order['bbo_biobank_order_id'] == row.
                        biobank_order_id, orders)[0])
            except IndexError:
                continue
            # if we haven't added any samples to this order, create an empty list.
            if 'bbo_samples' not in orders[idx]:
                orders[idx]['bbo_samples'] = list()
            # append the sample to the order
            orders[idx]['bbo_samples'].append({
                'bbs_test':
                row.test,
                'bbs_baseline_test':
                1 if row.test in baseline_tests else 0,  # Boolean field
                'bbs_dna_test':
                1 if row.test in dna_tests else 0,  # Boolean field
                'bbs_collected':
                row.collected,
                'bbs_processed':
                row.processed,
                'bbs_finalized':
                row.finalized,
                'bbs_confirmed':
                row.bb_confirmed,
                'bbs_status':
                str(SampleStatus.RECEIVED) if row.bb_confirmed else None,
                'bbs_status_id':
                int(SampleStatus.RECEIVED) if row.bb_confirmed else None,
                'bbs_created':
                row.bb_created,
                'bbs_disposed':
                row.bb_disposed,
                'bbs_disposed_reason':
                str(SampleStatus(row.bb_status)) if row.bb_status else None,
                'bbs_disposed_reason_id':
                int(SampleStatus(row.bb_status)) if row.bb_status else None,
            })

        if len(orders) > 0:
            data['biobank_orders'] = orders
        return data

    def _calculate_enrollment_status(self, summary):
        """
    Calculate the participant's enrollment status
    :param summary: summary data
    :return: dict
    """
        if 'consents' not in summary:
            return {}
        try:
            baseline_modules = config.getSettingList(
                'baseline_ppi_questionnaire_fields')
        except ValueError:
            baseline_modules = ['TheBasics', 'OverallHealth', 'Lifestyle']

        study_consent = ehr_consent = dvehr_consent = pm_complete = False
        status = None
        # iterate over consents
        for consent in summary['consents']:
            if consent['consent'] == 'ConsentPII':
                study_consent = True
            if consent['consent'] == 'EHRConsentPII_ConsentPermission' and \
                                  consent['consent_value'] == 'ConsentPermission_Yes':
                ehr_consent = True
            if consent['consent'] == 'DVEHRSharing_AreYouInterested' and \
                                  consent['consent_value'] == 'DVEHRSharing_Yes':
                dvehr_consent = True

        # check physical measurements
        if 'pm' in summary and summary['pm']:
            for row in summary['pm']:
                if row['pm_status_id'] == int(
                        PhysicalMeasurementsStatus.COMPLETED):
                    pm_complete = True
                    break

        baseline_module_count = dna_sample_count = 0
        if 'modules' in summary:
            baseline_module_count = len(
                filter(lambda module: module['mod_baseline_module'] == 1,
                       summary['modules']))
        if 'biobank_orders' in summary:
            for order in summary['biobank_orders']:
                if 'bbo_samples' in order:
                    dna_sample_count += len(
                        filter(lambda sample: sample['bbs_dna_test'] == 1,
                               order['bbo_samples']))

        if study_consent:
            status = EnrollmentStatus.INTERESTED
        if ehr_consent or dvehr_consent:
            status = EnrollmentStatus.MEMBER
        if pm_complete and 'modules' in summary and baseline_module_count == len(baseline_modules) and \
                dna_sample_count > 0:
            status = EnrollmentStatus.FULL_PARTICIPANT

        # TODO: Get Enrollment dates for additional fields -> participant_summary_dao.py:499

        # TODO: Calculate EHR status and dates -> participant_summary_dao.py:707

        data = {
            'enrollment_status': str(status) if status else None,
            'enrollment_status_id': int(status) if status else None,
        }
        return data

    def _calculate_distinct_visits(self, summary):  # pylint: disable=unused-argument
        """
    Calculate the distinct number of visits.
    :param summary: summary data
    :return: dict
    """
        def datetime_to_date(val):
            """
      Change from UTC to middle of the US before extracting date. That way if we have an early and late visit
      they will end up as the same day.
      """
            tmp = val.replace(tzinfo=tz.tzutc()).astimezone(
                tz.gettz('America/Denver'))
            return datetime.date(tmp.year, tmp.month, tmp.day)

        data = {}
        dates = list()

        if 'pm' in summary:
            for pm in summary['pm']:
                if pm['pm_status_id'] != int(PhysicalMeasurementsStatus.
                                             CANCELLED) and pm['pm_finalized']:
                    dates.append(datetime_to_date(pm['pm_finalized']))

        if 'biobank_orders' in summary:
            for order in summary['biobank_orders']:
                if order['bbo_status_id'] != int(BiobankOrderStatus.CANCELLED
                                                 ) and 'bbo_samples' in order:
                    for sample in order['bbo_samples']:
                        if 'bbs_finalized' in sample and sample['bbs_finalized'] and \
                                          isinstance(sample['bbs_finalized'], datetime.datetime):
                            dates.append(
                                datetime_to_date(sample['bbs_finalized']))

        dates = list(set(dates))  # de-dup list
        data['distinct_visits'] = len(dates)
        return data