Example #1
    def get_elasticsearch_latest(self, sync_params):
        """
        Get the latest docs
        :param sync_params: parameters of the sync
        :return:
        """
        # get configuration
        es = self.elastic['session']
        es_params = sync_params['elasticsearch']

        # construct query params
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "and": [
                            {
                                "range": {
                                    sync_params['version_col']: {
                                        "gte":
                                        unix_time_millis(self.time_last_run),
                                        "lte":
                                        unix_time_millis(self.time_this_run)
                                    }
                                }
                            },
                        ]
                    }
                }
            }
        }

        # check whether to ignore documents that came from the same source we are syncing to
        ignore_source = sync_params.get('ignore_same_source', None)
        source_id = sync_params['cassandra'].get('source_id', None)
        if ignore_source:
            if source_id:
                query['query']['constant_score']['filter']['and'].append(
                    {"not": {
                        "term": {
                            "source": source_id
                        }
                    }})
            else:
                log.warning(
                    'ignore_same_source set but no source_id given for the Cassandra data'
                )

        # execute using scan to get all rows, unordered
        try:
            es.indices.refresh(index=es_params['index'])
            res = helpers.scan(es, query=query, index=es_params['index'])
            return res
        except Exception:
            log.error('Sync: %s - Step: %s - Problem getting data' %
                      (sync_params['name'], sys._getframe().f_code.co_name))
            log.error(getError())
            return None
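
Every example on this page passes a datetime to unix_time_millis without showing its definition. A minimal sketch of such a helper, assuming naive UTC datetimes and an integer result in milliseconds since the Unix epoch:

    from datetime import datetime

    EPOCH = datetime.utcfromtimestamp(0)

    def unix_time_millis(dt):
        # assumed implementation: naive UTC datetime -> ms since 1970-01-01
        return int((dt - EPOCH).total_seconds() * 1000.0)
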
Example #2
    def run(self):

        tuple_ip_port = (self.tuple_ip_id[0], UDP_PORT)

        while conection_table.read_is_open(self.tuple_ip_id):
            message = self.conn.recv(BUFFER_SIZE - HEADER_SIZE)
            print("> TCP Received:", message)
            if not message:
                pdu_fin = AnonPDU(
                    AnonHeader(PDU_State.END_CONNECTION,
                               PDU_Flags.FIN,
                               id=self.tuple_ip_id[1]))
                pdu_fin.encrypt(self.aes_key)
                print("    > Sent FIN")
                #self.conn.close()
                conection_table.unverified_put(self.tuple_ip_id, pdu_fin)
                udp_socket.sendto(pdu_fin.get_bytes(), tuple_ip_port)

                # Wait for ack process
                #begin = unix_time_millis(datetime.datetime.now())
                #while True:
                #    if conection_table.unverified_empty(self.tuple_ip_id):
                #        # If empty, the packet was ACKed in the UDP read thread
                #        break
                #
                #    end = unix_time_millis(datetime.datetime.now())
                #    if end - begin >= TIMEOUT_MILLIS:
                #        begin = unix_time_millis(datetime.datetime.now())
                #        print("    > Resent FIN")
                #        udp_socket.sendto(pdu_fin.get_bytes(), tuple_ip_port)
                #
                if conection_table.close_read(self.tuple_ip_id) == 1:
                    conection_table.remove(self.tuple_ip_id)

            else:
                pdu_msg = AnonPDU(AnonHeader(PDU_State.DATA_EXCHANGE,
                                             0,
                                             data_or_key=PDU_DataOrKey.DATA,
                                             id=self.tuple_ip_id[1]),
                                  payload=message)
                pdu_msg.encrypt(self.aes_key)  # Message encrypt

                conection_table.unverified_put(self.tuple_ip_id, pdu_msg)
                udp_socket.sendto(pdu_msg.get_bytes(), tuple_ip_port)

                # Wait for ack process
                begin = unix_time_millis(datetime.datetime.now())
                while True:
                    if conection_table.unverified_empty(self.tuple_ip_id):
                        # If empty, the packet was ACKed in the UDP read thread
                        break

                    end = unix_time_millis(datetime.datetime.now())
                    if end - begin >= TIMEOUT_MILLIS:
                        begin = unix_time_millis(datetime.datetime.now())
                        udp_socket.sendto(pdu_msg.get_bytes(), tuple_ip_port)

        print("TCP_Read_UDP_Write End")
Example #3
def get_changing_information():
    now = datetime.now()
    percentage = 0
    speed = 0
    finishing_time = 0
    ETA = 0
    log_file = get_latest_log_file_name(datetime.now())
    if os.path.isfile(log_file):
        with open(log_file, 'r') as file_handler:
            value = utils.tail(file_handler, 1)
            if len(value) > 0:
                line = value[0]
                if 'ETA' in line:
                    percentage = float(line[line.find(']') + 2:line.find('%')])
                    speed = line[line.find('at ') + 3:line.find(' ETA')]
                    ETA = line[line.find('ETA ') + 4:]
                    if len(ETA) > 5:
                        time = datetime.strptime(ETA, '%H:%M:%S')
                        endTime = timedelta(hours=time.hour, minutes=time.minute, seconds=time.second)
                    else:
                        time = datetime.strptime(ETA, '%M:%S')
                        endTime = timedelta(minutes=time.minute, seconds=time.second)
                    finishing_time = utils.unix_time_millis(now + endTime)
    changing_video_information = [percentage, speed, finishing_time, ETA]
    return changing_video_information
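
The slicing in Example #3 assumes youtube-dl-style progress lines in the log; a quick illustration with a hypothetical final log line (the exact format is an assumption):

    # hypothetical last line of the log file
    line = '[download]  42.5% of 10.00MiB at  500.00KiB/s ETA 00:15'
    float(line[line.find(']') + 2:line.find('%')])  # -> 42.5
    line[line.find('at ') + 3:line.find(' ETA')]    # -> ' 500.00KiB/s'
    line[line.find('ETA ') + 4:]                    # -> '00:15', parsed as '%M:%S'
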
Example #4
    def get_comments(self):
        commentQuery = self.db.member_comments.member_id == self.memberID
        # WE USE THE QUERY DECLARED ABOVE AS A FILTER ON THE DB; NOTE THE TABLE IS REFERENCED IN THE FILTER
        commentRows = self.db(commentQuery).select()
        if commentRows:
            from utils import unix_time_millis
            memberComments = []
            for comment in commentRows:
                thisComment = {
                    "made_by": False,
                    "created": unix_time_millis(comment.create_date)
                               if comment.create_date else comment.create_date,
                    "comment": comment.comment
                }
                # SINCE WE ARE USING THE ROW ID, WE DO NOT NEED A QUERY OR A FILTER AT THE TABLE LEVEL
                user = self.db.auth_user(comment.staff_id)
                if user:
                    thisComment["made_by"] = user.first_name + " " + user.last_name

                memberComments.append(thisComment)

            return memberComments
        return False
Example #5
def main():
    num_problems = int(raw_input("How many problems would you like to create? "))
    for x in xrange(num_problems):
        problemdb.add_problem(generate_problem_name(), generate_sentence(), generate_hint(), generate_category(), generate_points(), "flag")

    num_teams = int(raw_input("How many teams would you like to create? "))
    for x in xrange(num_teams):
        base_time = datetime.datetime.utcnow() + datetime.timedelta(minutes=-(60*num_problems))
        solved = []
        team_name = generate_team_name()
        teamdb.add_team(team_name, "password", str(utils.unix_time_millis(base_time)))
        problems = problemdb.get_problems()

        for x in xrange(random.randint(1, num_problems)):
            pid = random.choice(problems)[0]
            if pid not in solved:
                new_base = random_date(base_time, base_time + datetime.timedelta(minutes=60))
                problemdb.submit_flag(team_name, pid, "flag", str(utils.unix_time_millis(new_base)))
                base_time = new_base
                solved.append(pid)
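
Example #5 also depends on a random_date helper that is not shown; a minimal sketch, assuming it returns a uniformly distributed datetime between the two bounds:

    import datetime
    import random

    def random_date(start, end):
        # assumed helper: uniformly random datetime in [start, end]
        span = (end - start).total_seconds()
        return start + datetime.timedelta(seconds=random.uniform(0, span))
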
Example #6
    def _createElasticSearchData(self,
                                 config,
                                 curr_time=None,
                                 amount=None,
                                 start=0):
        """
        Creates data on an index on Elasticsearch
        :param config: configuration
        :param curr_time: time to create data
        :param amount: amount to be created
        :param start: first id
        :return: the response from bulk insertion
        """
        params = config['elasticsearch']
        es = self.elastic['session']
        es.indices.create(index=params['index'], ignore=400)
        if not curr_time:
            curr_time = datetime.utcnow()
        if not amount:
            amount = self.data_amt

        # create data
        data = []
        for i in range(start, start + amount):
            action = dict(_type=params['type'],
                          _id=str(self.idList[i]),
                          _version_type='external',
                          _version=unix_time_millis(curr_time),
                          _source={
                              'text': id_generator(10),
                              'source': 'Elastic',
                              'date': curr_time,
                              'version': unix_time_millis(curr_time)
                          })
            data.append(action)

        # write to elasticsearch
        ret = bulk(es, data, chunk_size=700, index=params['index'])
        es.indices.flush(index=params['index'])

        return ret
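
Examples #6 and #9 call an id_generator helper for random document text; a common sketch, assuming it returns a random string of the requested length:

    import random
    import string

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        # assumed helper: random string of `size` characters drawn from `chars`
        return ''.join(random.choice(chars) for _ in range(size))
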
Example #7
def exercise_detail(request, exercise_id):
	try:
		exercise = Exercise.objects.get(pk=exercise_id)
	except Exercise.DoesNotExist:
		raise Http404

	weight_rep_data = []

	all_sets = exercise.set_set.all()
	for exercise_set in all_sets:
		weight_rep_data.append([unix_time_millis(exercise_set.session.date), float(exercise_set.weight_resistance), float(exercise_set.reps_mins)])

	exercise_detail_graph = generate_exercise_detail_graph(weight_rep_data, str(exercise.name))

	return render(request, 'workouts/exercise_detail.html', {'exercise':exercise, 'all_sets':all_sets, 'exercise_detail_graph':exercise_detail_graph})
Example #8
    def get_cassandra_latest(self, sync_params):
        """
         Get a cassandra family column latests updates
        :return:
            rows that are new or updated
        """
        # helpers
        session = self.cassandra['session']
        params = sync_params['cassandra']

        # construct the query and run it
        stmt = '''SELECT {fields_list}
                    FROM {table} '''
        stmt = stmt.format(fields_list=(params.get('fields_list', '*')),
                           table=params['table'])

        # check whether we can filter by date
        if sync_params.get('filter_date', None):
            stmt += '''WHERE {version_col} > {time_last_run}
                        AND {version_col} <= {time_this_run}
                        ALLOW FILTERING'''

            stmt = stmt.format(
                version_col=sync_params['version_col'],
                time_last_run=unix_time_millis(self.time_last_run),
                time_this_run=unix_time_millis(self.time_this_run))

        try:
            rows = session.execute(stmt)
        except Exception:
            log.error('Sync: %s - Step: %s - Table: %s Problem getting data' %
                      (sync_params['name'], sys._getframe().f_code.co_name,
                       params['table']))
            log.error(getError())
            return None
        return rows
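
The exception handlers in Examples #1, #8 and #10 log the result of getError(); a minimal sketch, assuming it simply formats the exception currently being handled:

    import traceback

    def getError():
        # assumed helper: formatted traceback of the active exception
        return traceback.format_exc()
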
Example #9
    def _createCassandraData(self,
                             config,
                             curr_time=None,
                             create_table=True,
                             amount=None,
                             start=0):
        """
        Will create 'tables' and populate data
        :param config: configuration
        :param curr_time: time to create data
        :param create_table: if the table should be created
        :param amount: amount to be created
        :param start: first id
        :return:
        """
        session = self.cassandra['session']
        keyspace = self.cassandra['keyspace']
        if not curr_time:
            curr_time = datetime.utcnow()
        if not amount:
            amount = self.data_amt
        params = config['cassandra']

        # create a table
        if create_table:
            stmt = ''' CREATE TABLE {keyspace}.{table} (
                          {id_col} UUID,
                          {version_col} bigint,
                          text varchar,
                          source varchar,
                          {date_col} timestamp,
                          PRIMARY KEY ({primary_key})
                        );'''

            # check if it will use date as a filter
            if config.get('filter_date', None):
                primary_key = '%s, %s' % (config['id_col'],
                                          config['version_col'])
            else:
                primary_key = config['id_col']
            stmt = stmt.format(keyspace=keyspace,
                               table=params['table'],
                               version_col=config['version_col'],
                               id_col=config['id_col'],
                               date_col=config['date_col'],
                               primary_key=primary_key)
            session.execute(stmt)
            if config.get('filter_date', None):
                session.execute(
                    'CREATE INDEX ON %s.%s (%s)' %
                    (keyspace, params['table'], config['version_col']))

        # Prepare the statements
        stmt = "INSERT INTO {keyspace}.{table} (" \
               "{id_col}, " \
               "{version_col}, " \
               "text, " \
               "source, " \
               "{date_col}) " \
               "VALUES (?, ?, ?, ?, ?)" \
               "USING TIMESTAMP ? "
        stmt = stmt.format(keyspace=keyspace,
                           table=params['table'],
                           version_col=config['version_col'],
                           id_col=config['id_col'],
                           date_col=config['date_col'])
        data_statement = session.prepare(stmt)

        # add the prepared statements to a batch
        count = 0
        batch = BatchStatement()
        for i in range(start, start + amount):
            batch.add(data_statement, [
                self.idList[i],
                unix_time_millis(curr_time),
                id_generator(10), 'CASSANDRA',
                unix_time_millis(curr_time),
                unix_time_millis(curr_time)
            ])
            count += 1

            # commit every 5000 records; batches near the 65k limit were causing timeouts
            if (count % 5000) == 0:
                # execute the batch
                session.execute(batch)
                # hack to get around the 65k statement limit of the Python driver
                batch._statements_and_parameters = []
                count = 0

        if count > 0:
            session.execute(batch)
            batch._statements_and_parameters = []
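
Resetting the driver's private _statements_and_parameters attribute works but is fragile; an alternative sketch of the flush step that simply starts a fresh BatchStatement:

    # inside the insert loop of Example #9, in place of the private-attribute hack
    if (count % 5000) == 0:
        session.execute(batch)
        batch = BatchStatement()  # fresh batch instead of clearing private state
        count = 0
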
Example #10
    def insert_cassandra(self, sync_params, rows):
        """
        Insert data into
        :rtype : object
        :param sync_params:
        :param rows:
        :return:
        """
        # helpers
        session = self.cassandra['session']
        params = sync_params['cassandra']
        keyspace = self.cassandra['keyspace']

        # get the table schema and sort the columns so the insert query binds values in a consistent order
        schema = self._get_table_schema(keyspace, params['table'])
        if not schema:
            return None, None
        cols = sorted(schema.keys())

        # Prepare the statements
        stmt = "INSERT INTO {keyspace}.{table} ("
        stmt += ", ".join(['%s' % k for k in cols])
        stmt += ") VALUES ("
        stmt += ", ".join([':' + k for k in cols])
        stmt += ") USING TIMESTAMP :p_timestamp "
        stmt = stmt.format(keyspace=keyspace, table=params['table'])

        try:
            data_statement = session.prepare(stmt)
        except Exception:
            log.error('Sync: %s - Step: %s - Problem inserting data' %
                      (sync_params['name'], sys._getframe().f_code.co_name))
            log.error(getError())
            return None, None

        # add the prepared statements to a batch
        count = 0
        total = 0
        errors = 0
        batch = BatchStatement()
        cols.remove(sync_params['id_col'])
        for row in rows:
            # convert to the cassandra structure
            try:
                # fill the data dictionary, setting None for columns that are not present
                data = {}
                source = row['_source']
                for col in cols:
                    data[col] = source.get(col, None)
                date = datetime.strptime(source[sync_params['date_col']],
                                         '%Y-%m-%dT%H:%M:%S.%f')
                data[sync_params['id_col']] = uuid.UUID(row['_id'])
                data[sync_params['date_col']] = unix_time_millis(date)
                data['p_timestamp'] = data['version']
                batch.add(data_statement, data)
                count += 1
            except Exception:
                log.error('Problem converting data {}'.format(row['_id']))
                log.error(getError())
                continue

            # commit every 5000 records; the driver limits how large a batch can be
            if (count % 5000) == 0:
                try:
                    # execute the batch
                    session.execute(batch)
                    total += count
                except Exception:
                    exc_info = sys.exc_info()
                    log.error(exc_info[1])
                    log.error(exc_info[2])
                    errors += count

                count = 0
                # hack to get around the 65k statement limit of the Python driver
                batch._statements_and_parameters = []

        if count > 0:
            try:
                # execute the batch
                session.execute(batch)
                total += count
            except Exception:
                log.error(
                    'Sync: %s - Step: %s - Problem inserting data' %
                    (sync_params['name'], sys._getframe().f_code.co_name))
                log.error(getError())
                errors += count

        return total, errors
Example #11
def convert_hca_json_to_magetab(mode,
                                data_dir,
                                project_uuids_filter=None,
                                new_only=True,
                                sender=None,
                                email_recipients=None):
    # Retrieve the HCA Json to MAGETAB translation config
    config = utils.get_config(process_name)
    idf_config = utils.get_val(config, 'idf')
    sdrf_config = utils.get_val(config, 'sdrf')

    logger = utils.create_logger(data_dir, process_name, mode)
    hca_api_url_root = utils.get_val(config, 'hca_api_url_root')
    # already_imported_project_uuids will be excluded from the import (and their json will not be cached)
    if new_only:
        already_imported_project_uuids = utils.get_previously_imported_projects(
            data_dir)
    else:
        already_imported_project_uuids = []

    project_uuids = hcadam.get_hca_project_uuid_to_import(
        hca_api_url_root, config, mode, project_uuids_filter,
        already_imported_project_uuids, logger)

    # project_uuid2gxa_accession dict forms the worklist of experiments to be imported from HCA
    project_uuid2gxa_accession = {}
    for project_uuid in project_uuids:
        project_uuid2gxa_accession[
            project_uuid] = hcadam.get_gxa_accession_for_project_uuid(
                project_uuid, config)
    project_uuid2gxa_accession = utils.resolve_gxa_accession_for_project_uuid(
        data_dir, project_uuid2gxa_accession)

    # Experiments imported from HCA DCC - for email report
    imported_experiments = []

    # Log experiments to be imported
    logger.info("About to import from HCA DCC the following experiments:")
    for project_uuid in project_uuid2gxa_accession.keys():
        logger.info("%s -> %s" %
                    (project_uuid, project_uuid2gxa_accession[project_uuid]))

    # Metadata retrieve starts here
    for project_uuid in project_uuid2gxa_accession.keys():
        time_start = utils.unix_time_millis(datetime.now())
        accession = project_uuid2gxa_accession.get(project_uuid)
        if new_only:
            # N.B. if new_only is True, HCA projects for which an idf file in data_dir doesn't exist will be imported
            idf_file_path = '%s/%s.idf.txt*' % (data_dir, accession)
            if glob.glob(idf_file_path):
                logger.info(
                    "Not importing %s as %s already exists (new_only mode: %s)"
                    % (accession, idf_file_path, str(new_only)))
                continue
        else:
            logger.info(
                'About to translate json for HCA study uuid: %s to magetab for gxa accession: %s'
                % (project_uuid, accession))

        # Retrieve all HCA json content for project_uuid
        hca_json_for_project_uuid = hcadam.get_json_for_project_uuid(
            project_uuid)

        # Initialise SDRF-related data structures and flags
        # Set of technologies found in bundles for a given project uuid. The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology.
        technologies_found = set([])

        # List of SDRF column headers (per technology) that will be output in each (technology-specific) sdrf file
        technology2sdrf_column_headers = {}
        # List value corresponding to each technology key in technology2rows dict will be used to accumulate rows of data to be output into the generated (technology-specific) SDRF file
        technology2rows = {}
        # For a given technology key, after all the bundles for a given project have been seen, the value (a set) holds the indexes of sdrf columns that are empty for this technology
        # (and therefore will be removed before the sdrf matrix is output into the sdrf file)
        # N.B. Before any bundles are seen, all columns are assumed to be empty until at least one value is encountered for each.
        technology2indexes_of_empty_columns = {}
        # Initialise IDF-related data structures (for data such as protocols - that need to be collected from all the bundles)
        # technology2protocol_type2protocols is used to store all protocol names - to be populated later inside IDF file
        technology2protocol_type2protocols = {}
        # technology2protocol_type2max_protocol_num_per_sample stores maximum number of protocols per technology-protocol_type in any given sample/bundle.
        # This number will dictate how many 'Protocol REF' columns should be output for that protocol_type in sdrf file for that technology
        technology2protocol_type2max_protocol_num_per_sample = {}

        # The characteristic_values dict stores sets of (unique) values per characteristic found - in order
        # to later automatically generate the corresponding Factors - for all characteristics for which the values change across the experiment (across all technologies).
        # N.B. A simplifying assumption is made here that in a multi-technology experiment, each technology-specific portion will get the same Factors
        characteristic_values = OrderedDict()

        # Auxiliary counter - used to limit number of HCA bundles processed during testing
        bundle_cnt = 0
        for bundle_url in hca_json_for_project_uuid.keys():
            # We want to warn of missing fields for the first bundle only (since each bundle will contain some technology); the test below
            # effectively checks whether we're dealing with the first bundle
            warn_of_missing_fields = not technologies_found
            hca_json_for_bundle = hca_json_for_project_uuid[bundle_url]
            context = (accession, project_uuid, bundle_url)
            ####################################################
            #### Collect protocols for IDF from bundle_uuid ####
            ####################################################
            protocol_type2protocols_in_bundle = OrderedDict([])
            for protocol_key in utils.get_val(config, 'protocol_types'):
                protocol_type2protocols_in_bundle[protocol_key] = OrderedSet(
                    [])
                for schema_type in list(hca_json_for_bundle.keys()):
                    if re.search(r"" + protocol_key, schema_type):
                        for protocol_json in hca_json_for_bundle[schema_type]:
                            protocol_name = utils.get_hca_value(
                                utils.get_val(config,
                                              'hca_protocol_name_path'),
                                protocol_json, logger, config, False,
                                'Protocol Name', context)
                            if protocol_name != utils.get_val(
                                    config, 'notfound'):
                                protocol_description = utils.get_hca_value(
                                    utils.get_val(
                                        config,
                                        'hca_protocol_description_path'),
                                    protocol_json, logger, config, False,
                                    'Protocol Description', context)
                                protocol_type = utils.get_hca_value(
                                    utils.get_val(config,
                                                  'hca_protocol_type_path'),
                                    protocol_json, logger, config, False,
                                    'Protocol Type', context)
                                protocol_type2protocols_in_bundle[
                                    protocol_key].add(
                                        (protocol_name, protocol_description,
                                         protocol_type))

            ##################
            ###### SDRF ######
            ##################
            technology = None
            # Set of indexes of non-empty sdrf columns for the current bundle_uuid
            indexes_of_non_empty_sdrf_columns = set([])
            # Output one SDRF row per each donor - sequence file tuple in the bundle
            # Assumptions relating to donors:
            # 1. Every HCA bundle has at least one json object for both: donor_organism and cell_suspension
            # 2. When multiple donor_organism and cell_suspension json objects exist, in the lists of json objects for donor_organism and
            #    cell_suspension respectively, the first JSON in donor_organism list corresponds to the first JSON in the cell_suspension list, and so on.
            #    However, in multi-donor samples with just one cell_suspension json object (e.g. project_uuid: d96c2451-6e22-441f-a3e6-70fd0878bb1b,
            #    bundle_url: https://dss.data.humancellatlas.org/v1/bundles/fb64e4f9-9a24-4a6a-856f-2b7c0d4f309d?version=2019-01-03T153203.452910Z&replica=aws
            #    that single cell_suspension json is assumed to apply to all donor_organism json objects in that bundle.
            donor_json_list = hca_json_for_bundle[utils.get_val(
                config, 'hca_donor_organism')]
            cell_suspension_json_list = hca_json_for_bundle[utils.get_val(
                config, 'hca_cell_suspension')]
            if len(cell_suspension_json_list) != len(donor_json_list) and len(
                    cell_suspension_json_list) != 1:
                err_msg = " Project: %s bundle: %s contain multiple donor_organism and cell_suspension jsons, but their number is not the same" % (
                    project_uuid, bundle_url)
                logger.error(err_msg)
                raise utils.HCA2MagetabTranslationError(err_msg)

            i = 0
            while i < len(donor_json_list):
                donor_json = donor_json_list[i]
                if len(cell_suspension_json_list) > 1:
                    cell_suspension_json = cell_suspension_json_list[i]
                else:
                    cell_suspension_json = cell_suspension_json_list[0]
                i += 1

                for datafile_json in hca_json_for_bundle[utils.get_val(
                        config, 'hca_sequence_file')]:
                    sdrf_column_headers = []

                    current_row = []
                    for line_item in sdrf_config:
                        magetab_label = line_item[0]
                        hca_path = line_item[1]
                        if isinstance(hca_path, list):
                            if not utils.position_valid_for_sdrf_column(
                                    magetab_label, sdrf_column_headers,
                                    config):
                                # Skip sdrf columns if the position in which they would be inserted would not be valid given the column just before: sdrf_column_headers[-1]
                                continue
                            elif magetab_label in [
                                    'Characteristics[geographical location]',
                                    'Characteristics[genotype]'
                            ]:
                                # Special handling/parsing - geographical location - multiple json files need checking for field presence
                                value = utils.get_val(config, 'notfound')
                                regex = hca_path[0]
                                for schema_type in list(
                                        hca_json_for_bundle.keys()):
                                    if re.search(r"" + regex, schema_type):
                                        for json_dict in hca_json_for_bundle[
                                                schema_type]:
                                            value = utils.get_hca_value(
                                                hca_path[1:], json_dict,
                                                logger, config,
                                                warn_of_missing_fields,
                                                magetab_label, context)
                                            if value != utils.get_val(
                                                    config, 'notfound'):
                                                break
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label, value,
                                    current_row, characteristic_values, config)
                            elif magetab_label == 'Protocol REF':
                                protocol_type = hca_path[0]
                                # TODO:
                                # Note that before sdrf is output, we will need to split protocol_ids into separate columns, but not before processing all the bundles in the project - we have to wait till the
                                # end so we know how many protocols per technology-protocol type we have. _In theory_ we could have 3 different enrichment protocols for a given technology in a project, e.g.
                                # FACS3, FACS5 and FACS8, and for the current bundle protocol_ids = 'FACS3,FACS8'. Then before outputting sdrf we will have to 'explode' the 'Protocol REF' column corresponding
                                # to the enrichment protocol into 3 (tab-delimited) new columns - and these columns for the current bundle_uuid will have values: 'FACS3\tFACS8\t' and
                                # headers: 'Protocol REF\tProtocol REF\tProtocol REF'
                                protocol_ids = ','.join([
                                    x[0] for x in list(
                                        protocol_type2protocols_in_bundle[
                                            protocol_type])
                                ])
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    protocol_ids, current_row,
                                    characteristic_values, config)
                            elif len(hca_path) > 0 and re.search(
                                    r"" + utils.get_val(
                                        config, 'hca_protocol_schema_regex'),
                                    hca_path[0]):
                                protocol_type = hca_path[0]
                                # Special handling/parsing - for a given protocol_type, various protocol-related information needs to be collected from potentially multiple HCA json files
                                values = set([])
                                for schema_type in [
                                        x for x in hca_json_for_bundle.keys()
                                        if x == protocol_type
                                ]:
                                    for json_dict in hca_json_for_bundle[
                                            schema_type]:
                                        value = utils.get_hca_value(
                                            hca_path[1:], json_dict, logger,
                                            config, warn_of_missing_fields,
                                            magetab_label, context)
                                        if value != utils.get_val(
                                                config, 'notfound'):
                                            if magetab_label == 'Comment[library construction]':
                                                # Capture technology for the current bundle
                                                hca_technology = value.lower()
                                                technology = utils.get_gxa_technology(
                                                    hca_technology, config)
                                                value = technology
                                            values.add(str(value))
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    ', '.join(values), current_row,
                                    characteristic_values, config)
                            elif magetab_label == 'Comment[HCA bundle url]':
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label,
                                    bundle_url, current_row,
                                    characteristic_values, config)
                            elif magetab_label in [
                                    'Comment[RUN]', 'Comment[FASTQ_URI]',
                                    'Scan Name',
                                    'Comment[technical replicate group]',
                                    'Comment[HCA file uuid]'
                            ]:
                                # Special handling/parsing - these values come from datafile_json, which needs checking for field presence
                                value = utils.get_hca_value(
                                    hca_path, datafile_json, logger, config,
                                    warn_of_missing_fields, magetab_label,
                                    context)
                                if magetab_label == 'Comment[RUN]':
                                    # NB. We're stripping e.g. _2.fastq.gz from the end - to retain just the core file name
                                    # Tested on the following types of file names:
                                    # "FCA7167226_I1.fastq.gz", "MantonBM7_HiSeq_4_S19_L005_R2_001.fastq.gz", "E18_20160930_Neurons_Sample_57_S054_L005_I1_010.fastq.gz", "FCA7167226.fastq.gz"
                                    value = re.sub(
                                        r"(\_\w\d|\_\w\d\_\d+|\_\d)*\.f\w+\.gz",
                                        "", value)
                                utils.add_to_row(
                                    indexes_of_non_empty_sdrf_columns,
                                    sdrf_column_headers, magetab_label, value,
                                    current_row, characteristic_values, config)
                            else:
                                schema_type = hca_path[0]
                                if schema_type in hca_json_for_bundle:
                                    if schema_type == utils.get_val(
                                            config, 'hca_donor_organism'):
                                        json_dict = donor_json
                                    elif schema_type == utils.get_val(
                                            config, 'hca_cell_suspension'):
                                        json_dict = cell_suspension_json
                                    else:
                                        # Retrieving the first element below follows the assumption of one single json object in schema_type in a bundle
                                        # (all the special cases were handled above)
                                        json_dict = hca_json_for_bundle[
                                            schema_type][0]
                                    value = utils.get_hca_value(
                                        hca_path[1:], json_dict, logger,
                                        config, warn_of_missing_fields,
                                        magetab_label, context)
                                else:
                                    value = utils.get_val(config, 'notfound')
                                if magetab_label in \
                                    ['Characteristics[organism]', 'Characteristics[disease]', 'Characteristics[cell subtype]', 'Characteristics[ethnic group]','Characteristics[strain]'] \
                                    and value != utils.get_val(config, 'notfound'):
                                    # Special handling/parsing - organism, disease - could be multiple according to HCA schema
                                    utils.add_to_row(
                                        indexes_of_non_empty_sdrf_columns,
                                        sdrf_column_headers, magetab_label,
                                        ','.join([x['text'] for x in value]),
                                        current_row, characteristic_values,
                                        config)
                                else:
                                    # magetab_label is not a list or a special case
                                    utils.add_to_row(
                                        indexes_of_non_empty_sdrf_columns,
                                        sdrf_column_headers, magetab_label,
                                        str(value), current_row,
                                        characteristic_values, config)
                        else:
                            # hca_path is not a list - add to the row as is
                            utils.add_to_row(indexes_of_non_empty_sdrf_columns,
                                             sdrf_column_headers,
                                             magetab_label, hca_path,
                                             current_row,
                                             characteristic_values, config)
                    # At least one bundle has been seen - the SDRF columns have now been determined
                    if technology:
                        # Append current_row to the list of rows in the SDRF file being generated
                        if technology not in technology2rows.keys():
                            technology2rows[technology] = []
                        technology2rows[technology].append(current_row)
                        # The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology.
                        if technology not in technologies_found:
                            technology2sdrf_column_headers[
                                technology] = sdrf_column_headers
                            # To start off with assume all columns are empty
                            technology2indexes_of_empty_columns[
                                technology] = range(len(sdrf_config))
                            # Initialise technology2protocol_type2protocols with new technology
                            technology2protocol_type2protocols[
                                technology] = OrderedDict()
                        technologies_found.add(technology)
                        # Store (without duplicates) for technology the protocols found for bundle_uuid (i.e. those in protocol_type2protocols_in_bundle)
                        for protocol_type in protocol_type2protocols_in_bundle.keys(
                        ):
                            num_protocols_in_bundle = len(
                                protocol_type2protocols_in_bundle[
                                    protocol_type])
                            if num_protocols_in_bundle > 0:
                                if technology not in technology2protocol_type2max_protocol_num_per_sample.keys(
                                ):
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology] = OrderedDict({
                                            protocol_type:
                                            num_protocols_in_bundle
                                        })
                                elif protocol_type not in technology2protocol_type2max_protocol_num_per_sample[
                                        technology].keys():
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology][
                                            protocol_type] = num_protocols_in_bundle
                                else:
                                    technology2protocol_type2max_protocol_num_per_sample[
                                        technology][protocol_type] = max(
                                            num_protocols_in_bundle,
                                            technology2protocol_type2max_protocol_num_per_sample[
                                                technology][protocol_type])
                                if protocol_type not in technology2protocol_type2protocols[
                                        technology].keys():
                                    technology2protocol_type2protocols[
                                        technology][
                                            protocol_type] = OrderedSet([])
                                # Merge set: protocol_type2protocols_in_bundle[protocol_type] into set already in technology2protocol_type2protocols[technology][protocol_type]
                                technology2protocol_type2protocols[technology][
                                    protocol_type] |= protocol_type2protocols_in_bundle[
                                        protocol_type]
                    else:
                        err_msg = "Failed to retrieve valid technology from value: \"%s\" in bundle: %s" % (
                            hca_technology, bundle_url)
                        logger.error(err_msg)
                        raise utils.HCA2MagetabTranslationError(err_msg)

            # Now remove from technology2indexes_of_empty_columns[technology] all column indexes we found non-empty values for, for the current bundle_uuid
            technology2indexes_of_empty_columns[technology] = [
                x for x in technology2indexes_of_empty_columns[technology]
                if x not in indexes_of_non_empty_sdrf_columns
            ]

            # Number of bundles processed per study - test mode cut-off
            bundle_cnt += 1
            if mode == 'test' and bundle_cnt >= utils.get_val(
                    config, 'test_max_bundles'):
                break

        # Now work out which Characteristics should be auto-generated as Factors also
        technology2factors = {}
        # Assumption - in experiments imported from HCA DCC, only one column for a unique characteristic name will be output in the resulting SDRF file
        technology2factor2characteristic_colnum = {}
        for technology in technologies_found:
            technology2factors[technology] = []
            technology2factor2characteristic_colnum[technology] = {}
            for characteristic in characteristic_values:
                if characteristic in technology2sdrf_column_headers[
                        technology] and len(
                            characteristic_values[characteristic]) > 1:
                    factor = re.sub("Characteristics", "FactorValue",
                                    characteristic)
                    technology2factors[technology].append(factor)
                    technology2sdrf_column_headers[technology].append(factor)
                    # Store index (in each sdrf row) of the characteristic corresponding factor, so that we know where to get the value from
                    # when populating factor values in sdrf later
                    technology2factor2characteristic_colnum[technology][
                        factor] = technology2sdrf_column_headers[
                            technology].index(characteristic)

            # Add Factor for single cell identifier (smart-seq2 experiments only)
            smart_regex = re.compile('smart-.*$')
            if smart_regex.match(technology):
                factor = 'FactorValue[single cell identifier]'
                technology2sdrf_column_headers[technology].append(factor)
                technology2factors[technology].append(factor)
                technology2factor2characteristic_colnum[technology][
                    factor] = technology2sdrf_column_headers[technology].index(
                        'Source Name')

        # For each technology, write out the generated SDRF file.
        # N.B. IF the HCA project is multi-technology, append the technology label to the end of the sdrf file name
        multi_technology_hca_project = len(technologies_found) > 1
        for technology in technologies_found:
            sdrf_file_name = "%s.sdrf.txt" % accession
            if multi_technology_hca_project:
                sdrf_file_name = "%s.%s" % (sdrf_file_name, technology)
            with open(os.path.join(data_dir, sdrf_file_name), 'wb') as f:
                csvwriter = csv.writer(f,
                                       delimiter='\t',
                                       encoding='utf-8',
                                       escapechar='\\',
                                       quotechar='',
                                       lineterminator='\n',
                                       quoting=csv.QUOTE_NONE)

                # Remove from technology2sdrf_column_headers[technology] headers of columns that are empty for this technology
                utils.remove_empty_columns(
                    technology2sdrf_column_headers[technology],
                    technology2indexes_of_empty_columns[technology])
                # Expand protocol column headers to account for multiple protocols per protocol_type, if applicable
                expanded_headers = technology2sdrf_column_headers[
                    technology].copy()
                utils.expand_protocol_columns(
                    None, expanded_headers,
                    technology2protocol_type2max_protocol_num_per_sample[
                        technology], logger)

                # Write out sdrf header line
                csvwriter.writerow(expanded_headers)

                for row in technology2rows[technology]:
                    # Append to row values for all the auto-generated factors
                    for factor in technology2factors[technology]:
                        row.append(row[technology2factor2characteristic_colnum[
                            technology][factor]])

                    # Remove from row elements in positions corresponding to columns that are empty for this technology
                    utils.remove_empty_columns(
                        row, technology2indexes_of_empty_columns[technology])
                    # Expand protocol values into multiple columns to account for multiple protocols per protocol_type, if applicable
                    utils.expand_protocol_columns(
                        row, technology2sdrf_column_headers[technology],
                        technology2protocol_type2max_protocol_num_per_sample[
                            technology], logger)

                    # Write out sdrf data line
                    csvwriter.writerow(row)

        #################
        ###### IDF ######
        #################
        for technology in technologies_found:
            idf_file_name = "%s.idf.txt" % accession
            if multi_technology_hca_project:
                idf_file_name = "%s.%s" % (idf_file_name, technology)
            with open(os.path.join(data_dir, idf_file_name), 'wb') as f:
                csvwriter = csv.writer(f,
                                       delimiter='\t',
                                       encoding='utf-8',
                                       escapechar='\\',
                                       quotechar='',
                                       lineterminator='\n',
                                       quoting=csv.QUOTE_NONE)
                for line_item in idf_config:
                    magetab_label = line_item[0]
                    hca_path = line_item[1]
                    if isinstance(hca_path, list):
                        if magetab_label in [
                                'Term Source Name', 'Term Source File'
                        ]:
                            # Special handling/parsing - hca_path is a list of literal values, rather than locations in HCA json files
                            csvwriter.writerow([magetab_label] + hca_path)
                            continue
                        if hca_path:
                            # Note the assumption that only one project_json object exists per bundle
                            # (c.f. hca_schemas_with_one_json_per_bundle_expected in hca2mtab.yml)
                            json_dict = hca_json_for_bundle[hca_path[0]][0]
                        value = utils.get_hca_value(hca_path[1:], json_dict,
                                                    logger, config, True,
                                                    magetab_label, context)
                        if magetab_label in [
                                'Public Release Date'
                        ] and value != utils.get_val(config, 'notfound'):
                            # Special handling/parsing - Public Release date, Comment[HCALastUpdateDate], Comment[HCAReleaseDate]
                            m = re.search(r'^(\d{4}\-\d{2}\-\d{2}).*$', value)
                            if m:
                                value = m.group(1)
                            else:
                                logger.error(
                                    "Failed to parse date out of: %s" % value)
                                value = ''
                            csvwriter.writerow([magetab_label, value])
                        elif magetab_label in [
                                'Comment[ExpressionAtlasAccession]',
                                'SDRF File'
                        ]:
                            # Special handling/parsing - use previously derived accession
                            value = accession
                            if magetab_label == 'SDRF File':
                                # SDRF file name - derive from experiment accession
                                value = re.sub(r'\.idf\.', '.sdrf.',
                                               idf_file_name)
                            candidate_acc_regex_obj = re.compile('E-CAND-\d+')
                            if magetab_label == 'SDRF File' or (
                                    magetab_label
                                    == 'Comment[ExpressionAtlasAccession]'
                                    and not candidate_acc_regex_obj.match(
                                        accession)):
                                csvwriter.writerow([magetab_label, value])
                        elif magetab_label in ['Comment[HCALastUpdateDate]']:
                            csvwriter.writerow([
                                magetab_label,
                                datetime.now().strftime("%Y-%m-%d")
                            ])
                        elif magetab_label == 'Comment[SecondaryAccession]':
                            # Special handling - secondary accessions
                            secondary_accessions = OrderedSet([])
                            for label in utils.get_val(
                                    config,
                                    'hca_old_secondary_accessions_labels'):
                                hca_project_json = hca_json_for_bundle[
                                    utils.get_val(config, 'hca_project')]
                                if label in hca_project_json:
                                    secondary_accessions.add(
                                        hca_project_json[label])
                            # For the reason for the loop below see a comment near hca_old_secondary_accessions_labels in hca2mtab.yml
                            for label in utils.get_val(
                                    config,
                                    'hca_new_secondary_accessions_labels'):
                                if label in hca_project_json:
                                    for secondary_accession in hca_project_json[
                                            label]:
                                        secondary_accessions.add(
                                            secondary_accession)
                            # Now append the HCA study uuid
                            secondary_accessions.add(project_uuid)
                            if len(secondary_accessions) > 0:
                                csvwriter.writerow(
                                    ['Comment[SecondaryAccession]'] +
                                    list(secondary_accessions))
                        elif magetab_label in [
                                'Experimental Factor Name',
                                'Experimental Factor Type'
                        ]:
                            # Special handling - populate factors that were auto-generated in SDRF above
                            idf_line = [magetab_label]
                            for factor in technology2factors[technology]:
                                m = re.search(r'\[(.*)\]', factor)
                                if m:
                                    idf_line.append(m.group(1))
                                else:
                                    err_msg = "Failed to extract Factor name from %s" % factor
                                    logger.error(err_msg)
                                    raise utils.HCA2MagetabTranslationError(
                                        err_msg)
                            csvwriter.writerow(idf_line)
                        elif isinstance(magetab_label, list):
                            if re.search('Person Last Name', magetab_label[0]):
                                # Special handling/parsing - Contributors.
                                # Each IDF row label maps to a list of
                                # per-contact values; a single pass over the
                                # contacts fills all the rows at once.
                                contact_rows = OrderedDict()
                                for row_label in magetab_label:
                                    contact_rows[row_label] = []
                                not_found = utils.get_val(config, 'notfound')
                                for contact in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger,
                                        config, True, magetab_label, context):
                                    contact_name_arr = contact[
                                        'contact_name'].split(',')
                                    contact_rows['Person Last Name'].append(
                                        contact_name_arr[0])
                                    contact_rows['Person First Name'].append(
                                        contact_name_arr[-1].lstrip())
                                    if len(contact_name_arr) == 3:
                                        contact_rows[
                                            'Person Mid Initials'].append(
                                                contact_name_arr[1])
                                    email = utils.get_hca_value(
                                        ['email'], contact, logger, config,
                                        True, magetab_label, context)
                                    contact_rows['Person Email'].append(
                                        email if email != not_found else '')
                                    contact_rows['Person Affiliation'].append(
                                        contact['institution'])
                                    address = utils.get_hca_value(
                                        ['address'], contact, logger, config,
                                        True, magetab_label, context)
                                    contact_rows['Person Address'].append(
                                        address if address != not_found
                                        else '')
                                for key in contact_rows:
                                    csvwriter.writerow([key] +
                                                       contact_rows[key])
                            elif 'Protocol Name' == magetab_label[0]:
                                # Special handling/parsing - Protocols
                                protocol_rows = OrderedDict()
                                for row_label in magetab_label:
                                    protocol_rows[row_label] = []
                                not_found = utils.get_val(config, 'notfound')
                                for protocol_type in technology2protocol_type2protocols[
                                        technology]:
                                    # Traverse protocol tuples in alphabetical
                                    # order - by protocol name
                                    for protocol_tuple in sorted(
                                            technology2protocol_type2protocols[
                                                technology][protocol_type],
                                            key=lambda x: x[0]):
                                        protocol_rows['Protocol Name'].append(
                                            protocol_tuple[0])
                                        protocol_rows[
                                            'Protocol Description'].append(
                                                protocol_tuple[1]
                                                if protocol_tuple[1] != not_found
                                                else '')
                                        protocol_rows['Protocol Type'].append(
                                            protocol_tuple[2]
                                            if protocol_tuple[2] != not_found
                                            else '')
                                for key in protocol_rows:
                                    csvwriter.writerow([key] +
                                                       protocol_rows[key])
                            elif re.search('Publication Title',
                                           magetab_label[0]):
                                not_found = utils.get_val(config, 'notfound')
                                if utils.get_hca_value(
                                        hca_path[1:], json_dict, logger,
                                        config, True, magetab_label[0],
                                        context) == not_found:
                                    # Skip the publications-related idf config
                                    continue
                                # Special handling/parsing - Publications
                                publication_rows = OrderedDict()
                                for row_label in ('Publication Title',
                                                  'Publication Author List',
                                                  'PubMed ID',
                                                  'Publication DOI'):
                                    publication_rows[row_label] = []
                                for publication in utils.get_hca_value(
                                        hca_path[1:], json_dict, logger,
                                        config, True, magetab_label, context):
                                    publication_rows[
                                        'Publication Title'].append(
                                            utils.get_hca_value(
                                                utils.get_val(
                                                    config,
                                                    'hca_publication_title_path'),
                                                publication, logger, config,
                                                True, magetab_label, context))
                                    publication_rows[
                                        'Publication Author List'].append(
                                            ', '.join(
                                                utils.get_hca_value(
                                                    utils.get_val(
                                                        config,
                                                        'hca_publication_authors_path'),
                                                    publication, logger,
                                                    config, True,
                                                    magetab_label, context)))
                                    pubmed_id = utils.get_hca_value(
                                        utils.get_val(
                                            config,
                                            'hca_publication_pmid_path'),
                                        publication, logger, config, True,
                                        magetab_label, context)
                                    publication_rows['PubMed ID'].append(
                                        str(pubmed_id)
                                        if str(pubmed_id) != not_found
                                        else '')
                                    publication_doi = utils.get_hca_value(
                                        utils.get_val(
                                            config,
                                            'hca_publication_doi_path'),
                                        publication, logger, config, True,
                                        magetab_label, context)
                                    publication_rows[
                                        'Publication DOI'].append(
                                            publication_doi
                                            if publication_doi != not_found
                                            else '')

                                for key in publication_rows:
                                    csvwriter.writerow([key] +
                                                       publication_rows[key])
                        else:
                            # magetab_label is neither a list nor one of the special cases above
                            csvwriter.writerow([magetab_label, value])
                            if magetab_label == 'Investigation Title':
                                imported_experiments.append(
                                    "%s (%s - %d bundles): %s" %
                                    (accession, technology,
                                     len(hca_json_for_project_uuid.keys()),
                                     value))
                    else:
                        # hca_path is not a list
                        csvwriter.writerow(line_item)
        time_end = utils.unix_time_millis(datetime.now())
        duration = (time_end - time_start) / 1000 / 60
        logger.info(
            "Processing HCA study uuid: %s for gxa accession: %s took %d mins"
            % (project_uuid, accession, duration))
    if imported_experiments and sender and email_recipients:
        utils.email_report("New experiments imported from HCA DCC",
                           '\n'.join(imported_experiments), sender,
                           email_recipients)
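Note: all the special cases above share one pattern - build an OrderedDict
mapping each IDF row label to the list of its column values, then emit one row
per label. A self-contained sketch of that pattern with made-up contact data
(the tab delimiter is an assumption: MAGE-TAB is tab-separated, and the real
csvwriter is created outside the excerpt shown above):

import csv
from collections import OrderedDict

contact_rows = OrderedDict()
for row_label in ['Person Last Name', 'Person First Name', 'Person Email']:
    contact_rows[row_label] = []

# One column per contributor (hypothetical values)
for last, first, email in [('Doe', 'Jane', 'jane@example.org'),
                           ('Roe', 'Richard', '')]:
    contact_rows['Person Last Name'].append(last)
    contact_rows['Person First Name'].append(first)
    contact_rows['Person Email'].append(email)

# Each dict key becomes the first cell of its row, followed by the values
with open('example.idf.txt', 'w', newline='') as idf_file:
    csvwriter = csv.writer(idf_file, delimiter='\t')
    for key, values in contact_rows.items():
        csvwriter.writerow([key] + values)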
Example #12
0
# Imports inferred from usage below; `utils` and `get_longest_hot_boiler`
# are project-local helpers (a sketch of the latter follows this example).
import datetime as dt

import plotly.graph_objs as go
from plotly import tools
from plotly.offline import plot

import utils


def compare_weeks(filename, title, df_c, df_h):
    HMO_num = filename[2:7]  # HMO identifier embedded in the filename
    print(HMO_num)

    # Week-long mean primary temperature, broadcast to every row so it can
    # be plotted as a flat reference line
    df_c['avg'] = df_c['PrimT'].mean()
    df_h['avg'] = df_h['PrimT'].mean()

    start_time = utils.unix_time_millis(df_c.first_valid_index())

    times = df_c.index.to_series().apply(
        lambda x: dt.datetime.strftime(x, '%A %H:%M')).tolist()
    trace_hwt_outlet_cold = go.Scatter(x=times,
                                       y=df_c.HwTOutlet,
                                       name="HW Outlet - Cold Week",
                                       connectgaps=True,
                                       line=dict(color='#1f77b4'))
    # HW Outlet is the temperature at the outlet going to the hot water system

    trace_temp_outlet_cold = go.Scatter(x=times,
                                        y=df_c.PrimT,
                                        name="Primary Temp - Cold Week",
                                        connectgaps=True,
                                        line=dict(color='#ff7f0e'))
    # Primary temperature is the temp measured at the top of the burner that heats up the water
    trace_boiler_average_cold = go.Scatter(x=times,
                                           y=df_c.avg,
                                           name="Boiler Average - Cold Week",
                                           connectgaps=True,
                                           line=dict(color='#2ca02c'))

    trace_hwt_outlet_hot = go.Scatter(x=times,
                                      y=df_h.HwTOutlet,
                                      name="HW Outlet - Hot Week",
                                      connectgaps=True,
                                      line=dict(color='#1f77b4'))
    # HW Outlet is the temperature at the outlet going to the hot water system

    trace_temp_outlet_hot = go.Scatter(x=times,
                                       y=df_h.PrimT,
                                       name="Primary Temp - Hot Week",
                                       connectgaps=True,
                                       line=dict(color='#ff7f0e'))
    # Primary temperature is the temp measured at the top of the burner that heats up the water
    trace_boiler_average_hot = go.Scatter(x=times,
                                          y=df_h.avg,
                                          name="Boiler Average - Hot Week",
                                          connectgaps=True,
                                          line=dict(color='#2ca02c'))

    fig = tools.make_subplots(rows=2,
                              cols=1,
                              specs=[[{}], [{}]],
                              subplot_titles=('HMO Analysis of Cold Week',
                                              'HMO Analysis of Hot Week'),
                              shared_xaxes=True,
                              shared_yaxes=False,
                              vertical_spacing=0.1)
    fig.append_trace(trace_hwt_outlet_cold, 1, 1)
    fig.append_trace(trace_temp_outlet_cold, 1, 1)
    fig.append_trace(trace_boiler_average_cold, 1, 1)
    fig.append_trace(trace_hwt_outlet_hot, 2, 1)
    fig.append_trace(trace_temp_outlet_hot, 2, 1)
    fig.append_trace(trace_boiler_average_hot, 2, 1)

    fig['layout'].update(title=title,
                         yaxis=dict(title='Temperature'),
                         xaxis=dict(tick0=start_time, nticks=21))

    plot(fig, filename=filename, auto_open=False)

    boiler_temp_threshold = 50
    df_c['tag'] = df_c.PrimT > boiler_temp_threshold
    df_h['tag'] = df_h.PrimT > boiler_temp_threshold

    # Hours above the threshold: /20 converts a sample count to hours,
    # assuming one reading every 3 minutes (20 samples per hour). Reuse the
    # 'tag' column so the threshold is applied consistently.
    time_spent_boiler_c = df_c['tag'].values.sum() / 20
    max_time_spent_boiler_c = get_longest_hot_boiler(df_c)
    time_spent_boiler_h = df_h['tag'].values.sum() / 20
    max_time_spent_boiler_h = get_longest_hot_boiler(df_h)

    cold_week_text = "During the cold week, the boiler was above 50°C for {} hours, with the longest period being {} " \
                     "hours long, on average it was {}°C ".format(time_spent_boiler_c, max_time_spent_boiler_c,
                                                                  df_c.avg.values[0])
    hot_week_text = "During the hot week, the boiler was above 50°C for {} hours, with the longest period being {} " \
                    "hours long, on average it was {}°C ".format(time_spent_boiler_h, max_time_spent_boiler_h,
                                                                 df_h.avg.values[0])
    # Use a context manager so the report file is closed reliably
    with open("./{}/additional_data.txt".format(HMO_num), "a+") as f:
        f.write(cold_week_text + "\r\n")
        f.write(hot_week_text + "\r\n")

    print(cold_week_text)
    print(hot_week_text)
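get_longest_hot_boiler is a project-local helper that is not part of this
snippet. A minimal sketch of what it might look like, assuming it scans the
boolean 'tag' column set in compare_weeks and uses the same
20-samples-per-hour conversion as time_spent_boiler_* above:

def get_longest_hot_boiler(df):
    # Longest unbroken run of above-threshold samples, converted to hours
    # (assumption: 20 samples per hour, matching the /20 factor above)
    longest = current = 0
    for above in df['tag']:
        current = current + 1 if above else 0
        longest = max(longest, current)
    return longest / 20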
Example #13
0
# Imports inferred from usage below; `utils` is a project-local module
# providing unix_time_millis.
import pandas as pd
import plotly
import plotly.graph_objs as go

import utils


def plot(filename, title, df):
    # df2 = df[df.index % 10 == 0]  # Selects every 10th row starting from 0
    df = df.fillna(method="ffill")
    print(df.head(20))
    # Build a datetime index from whichever timestamp column the log provides
    if 'Time' in df.columns:
        df['datetime'] = pd.to_datetime(df['Time'])
    else:
        df['datetime'] = pd.to_datetime(df['datetime'])

    df.index = df['datetime']

    start_time = utils.unix_time_millis(df.first_valid_index())

    # act_pow = df.ActPow
    # # print(act_pow[act_pow.notnull()])
    # act_pow = act_pow.replace(0, pd.np.nan)  # .dropna(axis=0, how='any').fillna(0).astype(int)

    # print(act_pow.loc[:, (act_pow != 0).any(axis=0)])
    # print(act_pow.head(20))

    df = df.resample("15T").mean()
    # df = df.replace(pd.np.nan, 0)  # .dropna(axis=0, how='any').fillna(0).astype(int)

    print(df.head(9))
    # commented out because resampling should hopefully get the mean now
    # hwt_outlet = df.HwTOutlet.rolling(60).mean()
    # hwt_outlet.drop(hwt_outlet.index[:59], inplace=True)
    # temp_outlet = df.PrimT.rolling(60).mean()
    # temp_outlet.drop(temp_outlet.index[:59], inplace=True)
    # print(hwt_outlet.head(10))
    # hwt_set = go.Scatter(x=df["Time"], y=df["HwTSet"], name="HW Setpoint", connectgaps=True)
    # trace_temp_set = go.Scatter(x=df.index, y=df.PrimTSet, name="Primary Temp Setpoint", connectgaps=True)

    trace_hwt_outlet = go.Scatter(x=df.index,
                                  y=df.HwTOutlet,
                                  name="HW Outlet",
                                  connectgaps=True)  # HW Outlet
    # is the temperature at the outlet going to the hot water system
    # trace_act_power = go.Scatter(x=df.index, y=act_pow, name="Actual Power", yaxis='y2')
    trace_temp_outlet = go.Scatter(x=df.index,
                                   y=df.PrimT,
                                   name="Primary Temp",
                                   connectgaps=True)  # Primary
    # temperature is the temp measured at the top of the burner that heats up the water

    layout = go.Layout(
        title=title,
        yaxis=dict(title='Temperature'),
        # yaxis2=dict(
        #     title='Load on Boiler',
        #     overlaying='y',
        #     # hoverformat='.0%',
        #     side='right',
        #     showgrid=False,
        #     zeroline=False,
        #     showline=False,
        # ),
        xaxis=dict(
            tick0=start_time,
            rangeselector=dict(buttons=list([
                dict(count=7, label='1w', step='day', stepmode='backward'),
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(step='all')
            ])),
            rangeslider=dict(visible=True),
            type='date'))

    plotly.offline.plot(
        {
            "data": [trace_hwt_outlet, trace_temp_outlet
                     ],  # trace_temp_set, trace_act_power],
            "layout": layout
        },
        filename=filename,
        auto_open=False)
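A hypothetical usage of the function above, assuming a CSV log whose columns
match the ones the function references (Time or datetime, HwTOutlet, PrimT);
both file names are made up:

df = pd.read_csv('boiler_log.csv')
plot('boiler_report.html', 'Boiler temperatures over time', df)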