def get_elasticsearch_latest(self, sync_params):
    """
    Get the documents that changed on Elasticsearch between the last run
    and this run.

    Builds a ``constant_score`` query whose filter is a range on
    ``sync_params['version_col']`` bounded (inclusively) by
    ``self.time_last_run`` and ``self.time_this_run``. When
    ``ignore_same_source`` is set and a Cassandra ``source_id`` is
    configured, documents whose ``source`` equals that id are excluded.

    :param sync_params: parameters of the sync; this method reads the keys
        ``elasticsearch`` (``index``), ``cassandra`` (optional ``source_id``),
        ``version_col``, ``name`` and the optional ``ignore_same_source``
    :return: whatever ``helpers.scan`` returns, or ``None`` when refreshing
        the index or starting the scan raised an exception
    """
    # get configuration
    es = self.elastic['session']
    es_params = sync_params['elasticsearch']

    # construct query params
    filters = [
        {
            "range": {
                sync_params['version_col']: {
                    "gte": unix_time_millis(self.time_last_run),
                    "lte": unix_time_millis(self.time_this_run)
                }
            }
        },
    ]
    query = {
        "query": {
            "constant_score": {
                "filter": {
                    "and": filters
                }
            }
        }
    }

    # Optionally skip documents that originated from the store we are about
    # to write to, so data is not bounced back to its own source.
    ignore_source = sync_params.get('ignore_same_source', None)
    source_id = sync_params['cassandra'].get('source_id', None)
    if ignore_source:
        if source_id:
            filters.append({"not": {"term": {"source": source_id}}})
        else:
            log.warning(
                'ignore_same_source set but no source_id given for the Cassandra data'
            )

    # execute using scan to get all rows, unordered
    try:
        es.indices.refresh(index=es_params['index'])
        return helpers.scan(es, query=query, index=es_params['index'])
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed and turned into a None result.
        log.error('Sync: %s - Step: %s - Problem getting data'
                  % (sync_params['name'], sys._getframe().f_code.co_name))
        log.error(getError())
        return None
def run(self):
    """
    Forward data read from the TCP connection to the peer over UDP.

    Loops while the connection table reports the read side of
    ``self.tuple_ip_id`` as open. Every chunk received from ``self.conn`` is
    wrapped in an encrypted DATA PDU, registered as unverified, sent, and
    re-sent every ``TIMEOUT_MILLIS`` until the unverified entry is cleared.
    An empty read sends a single encrypted FIN PDU and closes the read side.
    """
    peer_addr = (self.tuple_ip_id[0], UDP_PORT)

    while conection_table.read_is_open(self.tuple_ip_id):
        chunk = self.conn.recv(BUFFER_SIZE - HEADER_SIZE)
        print("> TCP Received:", chunk)

        if chunk:
            data_header = AnonHeader(PDU_State.DATA_EXCHANGE,
                                     0,
                                     data_or_key=PDU_DataOrKey.DATA,
                                     id=self.tuple_ip_id[1])
            data_pdu = AnonPDU(data_header, payload=chunk)
            data_pdu.encrypt(self.aes_key)  # Message encrypt

            conection_table.unverified_put(self.tuple_ip_id, data_pdu)
            udp_socket.sendto(data_pdu.get_bytes(), peer_addr)

            # Wait for ack process: the entry is expected to be emptied by
            # the udp read thread once the packet is acknowledged.
            last_sent = unix_time_millis(datetime.datetime.now())
            while not conection_table.unverified_empty(self.tuple_ip_id):
                current = unix_time_millis(datetime.datetime.now())
                if current - last_sent >= TIMEOUT_MILLIS:
                    last_sent = unix_time_millis(datetime.datetime.now())
                    udp_socket.sendto(data_pdu.get_bytes(), peer_addr)
            continue

        # Empty read: announce the end of the connection.
        fin_header = AnonHeader(PDU_State.END_CONNECTION,
                                PDU_Flags.FIN,
                                id=self.tuple_ip_id[1])
        fin_pdu = AnonPDU(fin_header)
        fin_pdu.encrypt(self.aes_key)
        print(" > Sent FIN")

        conection_table.unverified_put(self.tuple_ip_id, fin_pdu)
        udp_socket.sendto(fin_pdu.get_bytes(), peer_addr)

        # NOTE: unlike the data path, the FIN is sent once; the ack-wait /
        # retransmit loop and the self.conn.close() call are disabled here.
        if conection_table.close_read(self.tuple_ip_id) == 1:
            conection_table.remove(self.tuple_ip_id)

    print("TCP_Read_UDP_Write End")
def get_changing_information():
    """
    Read the progress figures from the last line of the latest log file.

    The last line is only parsed when it contains ``ETA``. The percentage is
    taken from between ``]`` and ``%``, the speed from between ``at `` and
    `` ETA``, and the ETA from everything after ``ETA ``.

    :return: ``[percentage, speed, finishing_time, ETA]``. All four are ``0``
        when the log file does not exist, is empty, or its last line has no
        ``ETA``. ``finishing_time`` is ``utils.unix_time_millis(now + ETA)``
        and stays ``0`` when the ETA text is not ``H:M:S`` / ``M:S``.
    """
    now = datetime.now()
    percentage = 0
    speed = 0
    finishing_time = 0
    eta = 0

    log_file = get_latest_log_file_name(now)
    if os.path.isfile(log_file):
        with open(log_file, 'r') as file_handler:
            last_lines = utils.tail(file_handler, 1)

        if len(last_lines) > 0 and 'ETA' in last_lines[0]:
            line = last_lines[0]
            percentage = float(line[line.find(']') + 2:line.find('%')])
            speed = line[line.find('at ') + 3:line.find(' ETA')]
            # Strip so a trailing newline/space neither skews the length
            # test below nor makes strptime fail on unconverted data.
            eta = line[line.find('ETA ') + 4:].strip()

            eta_format = '%H:%M:%S' if len(eta) > 5 else '%M:%S'
            try:
                parsed = datetime.strptime(eta, eta_format)
            except ValueError:
                # ETA is not a clock value; still report percentage, speed
                # and the raw ETA text, but leave finishing_time at 0.
                parsed = None

            if parsed is not None:
                remaining = timedelta(hours=parsed.hour,
                                      minutes=parsed.minute,
                                      seconds=parsed.second)
                finishing_time = utils.unix_time_millis(now + remaining)

    return [percentage, speed, finishing_time, eta]
def get_comments(self):
    """
    Return the comments stored for this member.

    :return: ``False`` when the member has no comments, otherwise a list of
        dicts with the keys ``made_by`` (staff member's "first last" name, or
        ``False`` when the staff record is not found), ``created``
        (``unix_time_millis`` of the creation date, or the falsy stored value
        as-is) and ``comment``.
    """
    db = self.db
    # The table is named inside the query itself, so the query can be used
    # directly as a filter on the database.
    rows = db(db.member_comments.member_id == self.memberID).select()
    if not rows:
        return False

    from utils import unix_time_millis

    collected = []
    for row in rows:
        if row.create_date:
            created = unix_time_millis(row.create_date)
        else:
            created = row.create_date
        entry = {"made_by": False, "created": created, "comment": row.comment}

        # Lookup by row id: no query needed, the table is called directly.
        author = db.auth_user(row.staff_id)
        if author:
            entry["made_by"] = author.first_name + " " + author.last_name

        collected.append(entry)
    return collected
def main():
    """
    Interactively seed the databases with generated problems and teams.

    Asks how many problems and teams to create. Every team is registered
    with a base time ``60 * num_problems`` minutes in the past and then
    submits the flag for a random selection of problems, each submission
    dated at a random moment within the hour following the previous one.
    """
    num_problems = int(raw_input("How many problems would you like to create? "))
    for _ in xrange(num_problems):
        problemdb.add_problem(generate_problem_name(),
                              generate_sentence(),
                              generate_hint(),
                              generate_category(),
                              generate_points(),
                              "flag")

    num_teams = int(raw_input("How many teams would you like to create? "))
    one_hour = datetime.timedelta(minutes=60)
    for _ in xrange(num_teams):
        base_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60 * num_problems)
        already_solved = []
        team_name = generate_team_name()
        teamdb.add_team(team_name, "password", str(utils.unix_time_millis(base_time)))

        problems = problemdb.get_problems()
        attempts = random.randint(1, num_problems)
        for _ in xrange(attempts):
            pid = random.choice(problems)[0]
            if pid in already_solved:
                # the same problem was drawn again; nothing to submit
                continue
            solved_at = random_date(base_time, base_time + one_hour)
            problemdb.submit_flag(team_name, pid, "flag",
                                  str(utils.unix_time_millis(solved_at)))
            base_time = solved_at
            already_solved.append(pid)
def _createElasticSearchData(self, config, curr_time=None, amount=None, start=0):
    """
    Creates data on an index on Elasticsearch

    The index is created first (a 400 response is ignored), then one
    externally-versioned document per id in ``self.idList[start:start+amount]``
    is bulk-inserted and the index is flushed.

    :param config: configuration; reads ``config['elasticsearch']``
        (``index`` and ``type``)
    :param curr_time: time to create data; defaults to ``datetime.utcnow()``
    :param amount: amount to be created; defaults to ``self.data_amt``
    :param start: first id
    :return: the response from bulk insertion
    """
    params = config['elasticsearch']
    es = self.elastic['session']
    index_name = params['index']

    es.indices.create(index=index_name, ignore=400)

    if not curr_time:
        curr_time = datetime.utcnow()
    if not amount:
        amount = self.data_amt

    # create data
    actions = [
        {
            '_type': params['type'],
            '_id': str(self.idList[position]),
            '_version_type': 'external',
            '_version': unix_time_millis(curr_time),
            '_source': {
                'text': id_generator(10),
                'source': 'Elastic',
                'date': curr_time,
                'version': unix_time_millis(curr_time)
            }
        }
        for position in range(start, start + amount)
    ]

    # write to elasticsearch
    response = bulk(es, actions, chunk_size=700, index=index_name)
    es.indices.flush(index=index_name)
    return response
def exercise_detail(request, exercise_id):
    """
    Render the detail page of one exercise together with its graph.

    Raises ``Http404`` when no exercise has the given primary key. The graph
    is generated from one ``[timestamp_ms, weight, reps]`` triple per set of
    the exercise.
    """
    try:
        exercise = Exercise.objects.get(pk=exercise_id)
    except Exercise.DoesNotExist:
        raise Http404

    all_sets = exercise.set_set.all()
    weight_rep_data = [
        [
            unix_time_millis(one_set.session.date),
            float(one_set.weight_resistance),
            float(one_set.reps_mins),
        ]
        for one_set in all_sets
    ]

    graph = generate_exercise_detail_graph(weight_rep_data, str(exercise.name))
    context = {
        'exercise': exercise,
        'all_sets': all_sets,
        'exercise_detail_graph': graph,
    }
    return render(request, 'workouts/exercise_detail.html', context)
def get_cassandra_latest(self, sync_params):
    """
    Get a cassandra family column latests updates

    Selects ``fields_list`` (default ``*``) from the configured table. When
    ``filter_date`` is set, only rows whose ``version_col`` is greater than
    ``self.time_last_run`` and at most ``self.time_this_run`` are requested
    (using ``ALLOW FILTERING``).

    :param sync_params: parameters of the sync; reads ``cassandra``
        (``table`` and optional ``fields_list``), ``name`` and, when
        ``filter_date`` is set, ``version_col``
    :return: rows that are new or updated, or ``None`` when the query raised
    """
    # helpers
    session = self.cassandra['session']
    params = sync_params['cassandra']

    # construct the query and run it
    stmt = '''SELECT {fields_list} FROM {table} '''
    stmt = stmt.format(fields_list=(params.get('fields_list', '*')),
                       table=params['table'])

    # check if could filter
    if sync_params.get('filter_date', None):
        stmt += '''WHERE {version_col} > {time_last_run} AND {version_col} <= {time_this_run} ALLOW FILTERING'''
        stmt = stmt.format(
            version_col=sync_params['version_col'],
            time_last_run=unix_time_millis(self.time_last_run),
            time_this_run=unix_time_millis(self.time_this_run))

    try:
        rows = session.execute(stmt)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed and turned into a None result.
        log.error('Sync: %s - Step: %s - Table: %s Problem getting data'
                  % (sync_params['name'], sys._getframe().f_code.co_name,
                     params['table']))
        log.error(getError())
        return None

    return rows
def _createCassandraData(self, config, curr_time=None, create_table=True, amount=None, start=0):
    """
    Will create 'tables' and populate data

    When ``create_table`` is set the table is created first; with
    ``filter_date`` configured the version column becomes part of the
    primary key and also gets a secondary index. One row is then inserted
    per id in ``self.idList[start:start+amount]``, in batches.

    :param config: configuration; reads ``cassandra`` (``table``),
        ``id_col``, ``version_col``, ``date_col`` and optional ``filter_date``
    :param curr_time: time to create data; defaults to ``datetime.utcnow()``
    :param create_table: if the table should be created
    :param amount: amount to be created; defaults to ``self.data_amt``
    :param start: first id
    :return:
    """
    session = self.cassandra['session']
    keyspace = self.cassandra['keyspace']

    if not curr_time:
        curr_time = datetime.utcnow()
    if not amount:
        amount = self.data_amt

    params = config['cassandra']
    table = params['table']

    # create a table
    if create_table:
        uses_date_filter = config.get('filter_date', None)

        # check if it will use date as a filter
        if uses_date_filter:
            primary_key = '%s, %s' % (config['id_col'], config['version_col'])
        else:
            primary_key = config['id_col']

        create_stmt = (
            ' CREATE TABLE {keyspace}.{table} ( '
            '{id_col} UUID, '
            '{version_col} bigint, '
            'text varchar, '
            'source varchar, '
            '{date_col} timestamp, '
            'PRIMARY KEY ({primary_key}) );'
        ).format(keyspace=keyspace,
                 table=table,
                 version_col=config['version_col'],
                 id_col=config['id_col'],
                 date_col=config['date_col'],
                 primary_key=primary_key)
        session.execute(create_stmt)

        if config.get('filter_date', None):
            session.execute(
                'CREATE INDEX ON %s.%s (%s)' %
                (keyspace, table, config['version_col']))

    # Prepare the statements
    insert_stmt = (
        "INSERT INTO {keyspace}.{table} ("
        "{id_col}, {version_col}, text, source, {date_col}) "
        "VALUES (?, ?, ?, ?, ?)"
        "USING TIMESTAMP ? "
    ).format(keyspace=keyspace,
             table=table,
             version_col=config['version_col'],
             id_col=config['id_col'],
             date_col=config['date_col'])
    prepared_insert = session.prepare(insert_stmt)

    # add the prepared statements to a batch
    pending = 0
    batch = BatchStatement()
    for position in range(start, start + amount):
        values = [
            self.idList[position],
            unix_time_millis(curr_time),
            id_generator(10),
            'CASSANDRA',
            unix_time_millis(curr_time),
            unix_time_millis(curr_time)
        ]
        batch.add(prepared_insert, values)
        pending += 1

        # every x records, commit. The parameter 65000 was giving timeout
        if pending % 5000 != 0:
            continue

        # execute the batch
        session.execute(batch)
        # hack to get around the 65k limit of python driver
        batch._statements_and_parameters = []
        pending = 0

    # flush whatever is left over from the last partial batch
    if pending > 0:
        session.execute(batch)
        batch._statements_and_parameters = []
def insert_cassandra(self, sync_params, rows):
    """
    Insert Elasticsearch hits into the configured Cassandra table.

    The table schema is looked up so the INSERT can name every column (in
    sorted order) with a matching named bind marker. Each row is converted
    to a parameter dict: schema columns missing from the document become
    ``None``, the id is parsed as a UUID and the date column is converted
    with ``unix_time_millis``. Rows are sent in batches of 5000.

    :param sync_params: parameters of the sync; reads ``cassandra``
        (``table``), ``id_col``, ``date_col`` and ``name``
    :param rows: iterable of hits, each providing ``_id`` and ``_source``
    :return: ``(total, errors)`` - number of rows in batches that executed
        and in batches that failed. Rows that could not be converted are
        logged and skipped, and are counted in neither. ``(None, None)``
        when the schema is unavailable or the statement cannot be prepared.
    """
    # helpers
    session = self.cassandra['session']
    params = sync_params['cassandra']
    keyspace = self.cassandra['keyspace']

    # get the table schema and order so that we can insert on query in correct order
    schema = self._get_table_schema(keyspace, params['table'])
    if not schema:
        return None, None

    # sorted() instead of keys() + list.sort(): also works where keys()
    # returns a view rather than a list.
    cols = sorted(schema.keys())

    # Prepare the statements
    stmt = "INSERT INTO {keyspace}.{table} ("
    stmt += ", ".join(['%s' % k for k in cols])
    stmt += ") VALUES ("
    stmt += ", ".join([':' + k for k in cols])
    stmt += ") USING TIMESTAMP :p_timestamp "
    stmt = stmt.format(keyspace=keyspace, table=params['table'])

    def log_insert_problem():
        """Log a failed prepare/execute the same way everywhere."""
        log.error('Sync: %s - Step: %s - Problem inserting data'
                  % (sync_params['name'], 'insert_cassandra'))
        log.error(getError())

    def flush():
        """Execute the current batch; return True on success."""
        try:
            session.execute(batch)
        except Exception:
            log_insert_problem()
            return False
        return True

    try:
        data_statement = session.prepare(stmt)
    except Exception:
        log_insert_problem()
        return None, None

    # add the prepared statements to a batch
    count = 0
    total = 0
    errors = 0
    batch = BatchStatement()
    # the id column is filled from the hit's _id, not from its _source
    cols.remove(sync_params['id_col'])
    for row in rows:
        # convert to the cassandra structure
        try:
            # fill the data dictionary and put none on columns that are not present
            source = row['_source']
            data = dict((col, source.get(col, None)) for col in cols)
            date = datetime.strptime(source[sync_params['date_col']],
                                     '%Y-%m-%dT%H:%M:%S.%f')
            data[sync_params['id_col']] = uuid.UUID(row['_id'])
            data[sync_params['date_col']] = unix_time_millis(date)
            # NOTE(review): reads the literal 'version' column rather than
            # sync_params['version_col'] - confirm this is intended.
            data['p_timestamp'] = data['version']
            batch.add(data_statement, data)
            count += 1
        except Exception:
            log.error('Problem converting data {}'.format(row['_id']))
            log.error(getError())
            continue

        # every x records, commit. There is a limitation on the driver
        if (count % 5000) == 0:
            if flush():
                total += count
            else:
                errors += count
            count = 0
            # hack to get around the 65k limit of python driver
            batch._statements_and_parameters = []

    # flush whatever is left over from the last partial batch
    if count > 0:
        if flush():
            total += count
        else:
            errors += count

    return total, errors
def convert_hca_json_to_magetab(mode, data_dir, project_uuids_filter=None, new_only=True, sender=None, email_recipients=None): # Retrieve the HCA Json to MAGETAB translation config config = utils.get_config(process_name) idf_config = utils.get_val(config, 'idf') sdrf_config = utils.get_val(config, 'sdrf') logger = utils.create_logger(data_dir, process_name, mode) hca_api_url_root = utils.get_val(config, 'hca_api_url_root') # already_imported_project_uuids will be excluded from the import (and their json will not be cached) if new_only: already_imported_project_uuids = utils.get_previously_imported_projects( data_dir) else: already_imported_project_uuids = [] project_uuids = hcadam.get_hca_project_uuid_to_import( hca_api_url_root, config, mode, project_uuids_filter, already_imported_project_uuids, logger) # project_uuid2gxa_accession dict forms the worklist of experiments to be imported from HCA project_uuid2gxa_accession = {} for project_uuid in project_uuids: project_uuid2gxa_accession[ project_uuid] = hcadam.get_gxa_accession_for_project_uuid( project_uuid, config) project_uuid2gxa_accession = utils.resolve_gxa_accession_for_project_uuid( data_dir, project_uuid2gxa_accession) # Experiments imported from HCA DCC - for email report imported_experiments = [] # Log experiments to be imported logger.info("About to import from HCA DCC the following experiments:") for project_uuid in project_uuid2gxa_accession.keys(): logger.info("%s -> %s" % (project_uuid, project_uuid2gxa_accession[project_uuid])) # Metadata retrieve starts here for project_uuid in project_uuid2gxa_accession.keys(): time_start = utils.unix_time_millis(datetime.now()) accession = project_uuid2gxa_accession.get(project_uuid) if new_only: # N.B. 
if new_only is True, HCA projects for which an idf file in data_dir doesn't exist will be imported idf_file_path = '%s/%s.idf.txt*' % (data_dir, accession) if glob.glob(idf_file_path): logger.info( "Not importing %s as %s already exists (new_only mode: %s)" % (accession, idf_file_path, str(new_only))) continue else: logger.info( 'About to translate json for HCA study uuid: %s to magetab for gxa accession: %s' % (project_uuid, accession)) # Retrieve all HCA json content for project_uuid hca_json_for_project_uuid = hcadam.get_json_for_project_uuid( project_uuid) # Initialise SDRF-related data structures and flags # Set of technologies found in bundles for a given project uuid. The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology. technologies_found = set([]) # List of SDRF column headers (per technology) that will be output in each (technology-specific) sdrf file technology2sdrf_column_headers = {} # List value corresponding to each technology key in technology2rows dict will be used to accumulate rows of data to be output into the generated (technology-specific) SDRF file technology2rows = {} # For a given technology key, after all the bundles for a given project have been seen, the value (set) indexes of sdrf columns that are empty for this technology # (and therefore will be removed before the sdrf matrix is output into the sdrf file) # N.B. Before any bundles are seen, all columns are assumed to be empty until at least one value is encountered for each. 
technology2indexes_of_empty_columns = {} # Initialise IDF-related data structures (for data such as protocols - that need to be collected from all the bundles) # technology2protocol_type2protocols is used to store all protocol names - to be populated later inside IDF file technology2protocol_type2protocols = {} # technology2protocol_type2max_protocol_num_per_sample stores maximum number of protocols per technology-protocol_type in any given sample/bundle. # This number will dictate how many 'Protocol REF' columns should be output for that protocol_type in sdrf file for that technology technology2protocol_type2max_protocol_num_per_sample = {} # characteristic_values_in_bundle dict stores sets of (unique) values per characteristic found - in order # to later automatically generate the corresponding Factors - for all characteristics for which the values change across the experiment (across all technologies). # N.B. A simplifying assumption is made here that in a multi-technology experiment, each technology-specific portion will get the same Factors characteristic_values = OrderedDict() # Auxiliary counter - used to limit number of HCA bundles processed during testing bundle_cnt = 0 for bundle_url in hca_json_for_project_uuid.keys(): # We want to warn of missing fields for the first bundle (since each bundle will contain some technology), the test below # effectively checks if we're dealing with the first bundle or not warn_of_missing_fields = not technologies_found hca_json_for_bundle = hca_json_for_project_uuid[bundle_url] context = (accession, project_uuid, bundle_url) #################################################### #### Collect protocols for IDF from bundle_uuid #### #################################################### protocol_type2protocols_in_bundle = OrderedDict([]) for protocol_key in utils.get_val(config, 'protocol_types'): protocol_type2protocols_in_bundle[protocol_key] = OrderedSet( []) for schema_type in list(hca_json_for_bundle.keys()): if 
re.search(r"" + protocol_key, schema_type): for protocol_json in hca_json_for_bundle[schema_type]: protocol_name = utils.get_hca_value( utils.get_val(config, 'hca_protocol_name_path'), protocol_json, logger, config, False, 'Protocol Name', context) if protocol_name != utils.get_val( config, 'notfound'): protocol_description = utils.get_hca_value( utils.get_val( config, 'hca_protocol_description_path'), protocol_json, logger, config, False, 'Protocol Description', context) protocol_type = utils.get_hca_value( utils.get_val(config, 'hca_protocol_type_path'), protocol_json, logger, config, False, 'Protocol Type', context) protocol_type2protocols_in_bundle[ protocol_key].add( (protocol_name, protocol_description, protocol_type)) ################## ###### SDRF ###### ################## technology = None # Set of indexes of sdrf columns with non-empty sdrf columns for the current bundle_uuid indexes_of_non_empty_sdrf_columns = set([]) # Output one SDRF row per each donor - sequence file tuple in the bundle # Assumptions relating to donors: # 1. Every HCA bundle has at least one json object for both: donor_organism and cell_suspension # 2. When multiple donor_organism and cell_suspension json objects exist, in the lists of json objects for donor_organism and # cell_suspension respectively, the first JSON in donor_organism list corresponds to the first JSON in the cell_suspension list, and so on. # However, in multi-donor samples with just one cell_suspension json object (e.g. project_uuid: d96c2451-6e22-441f-a3e6-70fd0878bb1b, # bundle_url: https://dss.data.humancellatlas.org/v1/bundles/fb64e4f9-9a24-4a6a-856f-2b7c0d4f309d?version=2019-01-03T153203.452910Z&replica=aws # that single cell_suspension json is assumed to apply to all donor_organism json objects in that bundle. 
donor_json_list = hca_json_for_bundle[utils.get_val( config, 'hca_donor_organism')] cell_suspension_json_list = hca_json_for_bundle[utils.get_val( config, 'hca_cell_suspension')] if len(cell_suspension_json_list) != len(donor_json_list) and len( cell_suspension_json_list) != 1: err_msg = " Project: %s bundle: %s contain multiple donor_organism and cell_suspension jsons, but their number is not the same" % ( project_uuid, bundle_url) logger.error(err_msg) raise utils.HCA2MagetabTranslationError(err_msg) i = 0 while i < len(donor_json_list): donor_json = donor_json_list[i] if len(cell_suspension_json_list) > 1: cell_suspension_json = cell_suspension_json_list[i] else: cell_suspension_json = cell_suspension_json_list[0] i += 1 for datafile_json in hca_json_for_bundle[utils.get_val( config, 'hca_sequence_file')]: sdrf_column_headers = [] current_row = [] for line_item in sdrf_config: magetab_label = line_item[0] hca_path = line_item[1] if isinstance(hca_path, list): if not utils.position_valid_for_sdrf_column( magetab_label, sdrf_column_headers, config): # Skip sdrf columns if the position in which they would be inserted would not be valid given the column just before: sdrf_column_headers[-1] continue elif magetab_label in [ 'Characteristics[geographical location]', 'Characteristics[genotype]' ]: # Special handling/parsing - geographical location - multiple json files need checking for field presence value = utils.get_val(config, 'notfound') regex = hca_path[0] for schema_type in list( hca_json_for_bundle.keys()): if re.search(r"" + regex, schema_type): for json_dict in hca_json_for_bundle[ schema_type]: value = utils.get_hca_value( hca_path[1:], json_dict, logger, config, warn_of_missing_fields, magetab_label, context) if value != utils.get_val( config, 'notfound'): break utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, value, current_row, characteristic_values, config) elif magetab_label == 'Protocol REF': protocol_type = 
hca_path[0] # TODO: # Note that before sdrf is output, we will need to split protocol_ids into separate columns, but not before processing all the bundles in the project - we have to wait till the # end to we know how many protocols per technology-protocol type we have. _In theory_ we could have 3 different enrichment protocols for a given technology in a project, e.g. # FACS3, FACS5 and FACS8, and for the current bundle protocol_ids = 'FACS3,FACS8'. Then before outputting sdrf we will have to 'explode' the 'Protocol REF' column corresponding # to the enrichment protocol into 3 (tab-delimited) new columns - and these columns for the current bundle_uuid will have values: 'FACS3\tFACS8\t' and # headers: 'Protocol REF\tProtocol REF\tProtocol REF' protocol_ids = ','.join([ x[0] for x in list( protocol_type2protocols_in_bundle[ protocol_type]) ]) utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, protocol_ids, current_row, characteristic_values, config) elif len(hca_path) > 0 and re.search( r"" + utils.get_val( config, 'hca_protocol_schema_regex'), hca_path[0]): protocol_type = hca_path[0] # Special handling/parsing - for a given protocol_type, various protocol-related information needs to be collected from potentially multiple HCA json files values = set([]) for schema_type in [ x for x in hca_json_for_bundle.keys() if x == protocol_type ]: for json_dict in hca_json_for_bundle[ schema_type]: value = utils.get_hca_value( hca_path[1:], json_dict, logger, config, warn_of_missing_fields, magetab_label, context) if value != utils.get_val( config, 'notfound'): if magetab_label == 'Comment[library construction]': # Capture technology for the current bundle hca_technology = value.lower() technology = utils.get_gxa_technology( hca_technology, config) value = technology values.add(str(value)) utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, ', '.join(values), current_row, characteristic_values, config) 
elif magetab_label == 'Comment[HCA bundle url]': utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, bundle_url, current_row, characteristic_values, config) elif magetab_label in [ 'Comment[RUN]', 'Comment[FASTQ_URI]', 'Scan Name', 'Comment[technical replicate group]', 'Comment[HCA file uuid]' ]: # Special handling/parsing - Comment[RUN] - datafile_key json file need checking for field presence value = utils.get_hca_value( hca_path, datafile_json, logger, config, warn_of_missing_fields, magetab_label, context) if magetab_label == 'Comment[RUN]': # NB. We're stripping e.g. _2.fastq.gz from the end - to retain just the core file name # Tested on the following types of file names: # "FCA7167226_I1.fastq.gz", "MantonBM7_HiSeq_4_S19_L005_R2_001.fastq.gz", "E18_20160930_Neurons_Sample_57_S054_L005_I1_010.fastq.gz", "FCA7167226.fastq.gz" value = re.sub( r"(\_\w\d|\_\w\d\_\d+|\_\d)*\.f\w+\.gz", "", value) utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, value, current_row, characteristic_values, config) else: schema_type = hca_path[0] if schema_type in hca_json_for_bundle: if schema_type == utils.get_val( config, 'hca_donor_organism'): json_dict = donor_json elif schema_type == utils.get_val( config, 'hca_cell_suspension'): json_dict = cell_suspension_json else: # Retrieving the first element below follows the assumption of one single json object in schema_type in a bundle # (all the special cases were handled above) json_dict = hca_json_for_bundle[ schema_type][0] value = utils.get_hca_value( hca_path[1:], json_dict, logger, config, warn_of_missing_fields, magetab_label, context) else: value = utils.get_val(config, 'notfound') if magetab_label in \ ['Characteristics[organism]', 'Characteristics[disease]', 'Characteristics[cell subtype]', 'Characteristics[ethnic group]','Characteristics[strain]'] \ and value != utils.get_val(config, 'notfound'): # Special handling/parsing - organism, disease - 
could be multiple according to HCA schema utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, ','.join([x['text'] for x in value]), current_row, characteristic_values, config) else: # magetab_label is not a list or a special case utils.add_to_row( indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, str(value), current_row, characteristic_values, config) else: # hca_path is not a list - add to the row as is utils.add_to_row(indexes_of_non_empty_sdrf_columns, sdrf_column_headers, magetab_label, hca_path, current_row, characteristic_values, config) # At least one bundle has been seen - the SDRF columns have now been determined if technology: # Append current_row to the list of rows in the SDRF file being generated if technology not in technology2rows.keys(): technology2rows[technology] = [] technology2rows[technology].append(current_row) # The presence of a technology name in that set acts as a flag that sdrf column headers have been collected for that technology. if technology not in technologies_found: technology2sdrf_column_headers[ technology] = sdrf_column_headers # To start off with assume all columns are empty technology2indexes_of_empty_columns[ technology] = range(len(sdrf_config)) # Initialise technology2protocol_type2protocols with new technology technology2protocol_type2protocols[ technology] = OrderedDict() technologies_found.add(technology) # Store (without duplicates) for technology the protocols found for bundle_uuid (i.e. 
those in protocol_type2protocols_in_bundle) for protocol_type in protocol_type2protocols_in_bundle.keys( ): num_protocols_in_bundle = len( protocol_type2protocols_in_bundle[ protocol_type]) if num_protocols_in_bundle > 0: if technology not in technology2protocol_type2max_protocol_num_per_sample.keys( ): technology2protocol_type2max_protocol_num_per_sample[ technology] = OrderedDict({ protocol_type: num_protocols_in_bundle }) elif protocol_type not in technology2protocol_type2max_protocol_num_per_sample[ technology].keys(): technology2protocol_type2max_protocol_num_per_sample[ technology][ protocol_type] = num_protocols_in_bundle else: technology2protocol_type2max_protocol_num_per_sample[ technology][protocol_type] = max( num_protocols_in_bundle, technology2protocol_type2max_protocol_num_per_sample[ technology][protocol_type]) if protocol_type not in technology2protocol_type2protocols[ technology].keys(): technology2protocol_type2protocols[ technology][ protocol_type] = OrderedSet([]) # Merge set: protocol_type2protocols_in_bundle[protocol_type] into set already in technology2protocol_type2protocols[technology][protocol_type] technology2protocol_type2protocols[technology][ protocol_type] |= protocol_type2protocols_in_bundle[ protocol_type] else: err_msg = "Failed to retrieve valid technology from value: \"%s\" in bundle: %s" % ( hca_technology, bundle_url) logger.error(err_msg) raise utils.HCA2MagetabTranslationError(err_msg) # Now remove from technology2indexes_of_empty_columns[technology] all column indexes we found non-empty values for, for the current bundle_uuid technology2indexes_of_empty_columns[technology] = [ x for x in technology2indexes_of_empty_columns[technology] if x not in indexes_of_non_empty_sdrf_columns ] # Number of bundles processed per study - test mode cut-off if mode == 'test' and bundle_cnt >= utils.get_val( config, 'test_max_bundles'): break # Now work out which Characteristics should be auto-generated as Factors also technology2factors = {} 
# Assumption - in experiments imported from HCA DCC, only one column for a unique characteristic name will be output in the resulting SDRF file technology2factor2characteristic_colnum = {} for technology in technologies_found: technology2factors[technology] = [] technology2factor2characteristic_colnum[technology] = {} for characteristic in characteristic_values: if characteristic in technology2sdrf_column_headers[ technology] and len( characteristic_values[characteristic]) > 1: factor = re.sub("Characteristics", "FactorValue", characteristic) technology2factors[technology].append(factor) technology2sdrf_column_headers[technology].append(factor) # Store index (in each sdrf row) of the characteristic corresponding factor, so that we know where to get the value from # when populating factor values in sdrf later technology2factor2characteristic_colnum[technology][ factor] = technology2sdrf_column_headers[ technology].index(characteristic) # Add Factor for single cell identifier (smart-seq2 experiments only) smart_regex = re.compile('smart-.*$') if smart_regex.match(technology): factor = 'FactorValue[single cell identifier]' technology2sdrf_column_headers[technology].append(factor) technology2factors[technology].append(factor) technology2factor2characteristic_colnum[technology][ factor] = technology2sdrf_column_headers[technology].index( 'Source Name') # For each technology, write out the generated SDRF file. # N.B. 
IF the HCA project is multi-technology, append the technology label to the end of the sdrf file name multi_technology_hca_project = len(technologies_found) > 1 for technology in technologies_found: sdrf_file_name = "%s.sdrf.txt" % accession if multi_technology_hca_project: sdrf_file_name = "%s.%s" % (sdrf_file_name, technology) with open(os.path.join(data_dir, sdrf_file_name), 'wb') as f: csvwriter = csv.writer(f, delimiter='\t', encoding='utf-8', escapechar='\\', quotechar='', lineterminator='\n', quoting=csv.QUOTE_NONE) # Remove from technology2sdrf_column_headers[technology] headers of columns that are empty for this technology utils.remove_empty_columns( technology2sdrf_column_headers[technology], technology2indexes_of_empty_columns[technology]) # Expand protocol column headers to account for multiple protocols per protocol_type, if applicable expanded_headers = technology2sdrf_column_headers[ technology].copy() utils.expand_protocol_columns( None, expanded_headers, technology2protocol_type2max_protocol_num_per_sample[ technology], logger) # Write out sdrf header line csvwriter.writerow(expanded_headers) for row in technology2rows[technology]: # Append to row values for all the auto-generated factors for factor in technology2factors[technology]: row.append(row[technology2factor2characteristic_colnum[ technology][factor]]) # Remove from row elements in positions corresponding to columns that are empty for this technology utils.remove_empty_columns( row, technology2indexes_of_empty_columns[technology]) # Expand protocol values into multiple columns to account for multiple protocols per protocol_type, if applicable utils.expand_protocol_columns( row, technology2sdrf_column_headers[technology], technology2protocol_type2max_protocol_num_per_sample[ technology], logger) # Write out sdrf data line csvwriter.writerow(row) ################# ###### IDF ###### ################# for technology in technologies_found: idf_file_name = "%s.idf.txt" % accession if 
multi_technology_hca_project: idf_file_name = "%s.%s" % (idf_file_name, technology) with open(os.path.join(data_dir, idf_file_name), 'wb') as f: csvwriter = csv.writer(f, delimiter='\t', encoding='utf-8', escapechar='\\', quotechar='', lineterminator='\n', quoting=csv.QUOTE_NONE) for line_item in idf_config: magetab_label = line_item[0] hca_path = line_item[1] if isinstance(hca_path, list): if magetab_label in [ 'Term Source Name', 'Term Source File' ]: # Special handling/parsing - hca_path is a list of literal values, rather than locations in HCA json files csvwriter.writerow([magetab_label] + hca_path) continue if hca_path: # Note the assumption that only one project_json object exists per bundle # (c.f. hca_schemas_with_one_json_per_bundle_expected in hca2mtab.yml) json_dict = hca_json_for_bundle[hca_path[0]][0] value = utils.get_hca_value(hca_path[1:], json_dict, logger, config, True, magetab_label, context) if magetab_label in [ 'Public Release Date' ] and value != utils.get_val(config, 'notfound'): # Special handling/parsing - Public Release date, Comment[HCALastUpdateDate], Comment[HCAReleaseDate] m = re.search(r'^(\d{4}\-\d{2}\-\d{2}).*$', value) if m: value = m.group(1) else: logger.error( "Failed to parse date out of: %s" % value) value = '' csvwriter.writerow([magetab_label, value]) elif magetab_label in [ 'Comment[ExpressionAtlasAccession]', 'SDRF File' ]: # Special handling/parsing - use previously derived accession value = accession if magetab_label == 'SDRF File': # SDRF file name - derive from experiment accession value = re.sub(r'\.idf\.', '.sdrf.', idf_file_name) candidate_acc_regex_obj = re.compile('E-CAND-\d+') if magetab_label == 'SDRF File' or ( magetab_label == 'Comment[ExpressionAtlasAccession]' and not candidate_acc_regex_obj.match( accession)): csvwriter.writerow([magetab_label, value]) elif magetab_label in ['Comment[HCALastUpdateDate]']: csvwriter.writerow([ magetab_label, datetime.now().strftime("%Y-%m-%d") ]) elif magetab_label == 
'Comment[SecondaryAccession]': # Special handling - secondary accessions secondary_accessions = OrderedSet([]) for label in utils.get_val( config, 'hca_old_secondary_accessions_labels'): hca_project_json = hca_json_for_bundle[ utils.get_val(config, 'hca_project')] if label in hca_project_json: secondary_accessions.add( hca_project_json[label]) # For the reason for the loop below see a comment near hca_old_secondary_accessions_labels in hca2mtab.yml for label in utils.get_val( config, 'hca_new_secondary_accessions_labels'): if label in hca_project_json: for secondary_accession in hca_project_json[ label]: secondary_accessions.add( secondary_accession) # Now append the HCA study uuid secondary_accessions.add(project_uuid) if len(secondary_accessions) > 0: csvwriter.writerow( ['Comment[SecondaryAccession]'] + list(secondary_accessions)) elif magetab_label in [ 'Experimental Factor Name', 'Experimental Factor Type' ]: # Special handling - populate factors that where auto-generated in SDRF above idf_line = [magetab_label] for factor in technology2factors[technology]: m = re.search(r'\[(.*)\]', factor) if m: idf_line.append(m.group(1)) else: err_msg = "Failed to extract Factor name from %s" % factor logger.error(err_msg) raise utils.HCA2MagetabTranslationError( err_msg) csvwriter.writerow(idf_line) elif isinstance(magetab_label, list): if re.search('Person Last Name', magetab_label[0]): # Special handling/parsing - Contributors contact_rows = OrderedDict() for row_label in magetab_label: contact_rows[row_label] = [] for contact in utils.get_hca_value( hca_path[1:], json_dict, logger, config, True, magetab_label, context): contact_name_arr = contact[ 'contact_name'].split(',') contact_rows['Person Last Name'].append( contact_name_arr[0]) contact_rows['Person First Name'].append( contact_name_arr[-1].lstrip()) if len(contact_name_arr) == 3: contact_rows[ 'Person Mid Initials'].append( contact_name_arr[1]) for contact in utils.get_hca_value( hca_path[1:], json_dict, logger, 
config, True, magetab_label, context): email = utils.get_hca_value( ['email'], contact, logger, config, True, magetab_label, context) contact_rows['Person Email'].append( email if email != utils. get_val(config, 'notfound') else '') contact_rows['Person Affiliation'].append( contact['institution']) for contact in utils.get_hca_value( hca_path[1:], json_dict, logger, config, True, magetab_label, context): address = utils.get_hca_value( ['address'], contact, logger, config, True, magetab_label, context) contact_rows['Person Address'].append( address if address != utils. get_val(config, 'notfound') else '') for key in list(contact_rows.keys()): csvwriter.writerow([key] + contact_rows[key]) elif 'Protocol Name' == magetab_label[0]: # Special handling/parsing - Protocols protocol_rows = OrderedDict() for row_label in magetab_label: protocol_rows[row_label] = [] for protocol_type in technology2protocol_type2protocols[ technology].keys(): # Traverse through protocol tuples in alphabetic order - by protocol name for protocol_tuple in sorted( technology2protocol_type2protocols[ technology][protocol_type], key=lambda x: x[0]): protocol_rows['Protocol Name'].append( protocol_tuple[0]) protocol_rows[ 'Protocol Description'].append( protocol_tuple[1] if protocol_tuple[1] != utils. 
get_val(config, 'notfound') else '') protocol_rows['Protocol Type'].append( protocol_tuple[2] if protocol_tuple[2] != utils.get_val( config, 'notfound') else '') for key in list(protocol_rows.keys()): csvwriter.writerow([key] + protocol_rows[key]) elif re.search('Publication Title', magetab_label[0]): if utils.get_hca_value( hca_path[1:], json_dict, logger, config, True, magetab_label[0], context) == utils.get_val( config, 'notfound'): # Skip the publications-related idf config continue # Special handling/parsing - Publications publication_rows = OrderedDict() for row_label in 'Publication Title', 'Publication Author List', 'PubMed ID', 'Publication DOI': publication_rows[row_label] = [] for publication in utils.get_hca_value( hca_path[1:], json_dict, logger, config, True, magetab_label, context): publication_rows[ 'Publication Title'].append( utils.get_hca_value( utils.get_val( config, 'hca_publication_title_path' ), publication, logger, config, True, magetab_label, context)) publication_rows[ 'Publication Author List'].append( ', '.join( utils.get_hca_value( utils.get_val( config, 'hca_publication_authors_path' ), publication, logger, config, True, magetab_label, context))) pubmed_id = utils.get_hca_value( utils.get_val( config, 'hca_publication_pmid_path'), publication, logger, config, True, magetab_label, context) publication_rows['PubMed ID'].append( str(pubmed_id ) if str(pubmed_id) != utils. get_val(config, 'notfound') else '') publication_doi = utils.get_hca_value( utils.get_val( config, 'hca_publication_doi_path'), publication, logger, config, True, magetab_label, context) publication_rows['Publication DOI'].append( publication_doi if publication_doi != utils. 
get_val(config, 'notfound') else '') for key in list(publication_rows.keys()): csvwriter.writerow([key] + publication_rows[key]) else: # magetab_label is not a list or a special case csvwriter.writerow([magetab_label, value]) if magetab_label == 'Investigation Title': imported_experiments.append( "%s (%s - %d bundles): %s" % (accession, technology, len(hca_json_for_project_uuid.keys()), value)) else: # hca_path is not a list csvwriter.writerow(line_item) time_end = utils.unix_time_millis(datetime.now()) duration = (time_end - time_start) / 1000 / 60 logger.info( "Processing HCA study uuid: %s for gxa accession: %s took %d mins" % (project_uuid, accession, duration)) if imported_experiments and sender and email_recipients: utils.email_report("New experiments imported from HCA DCC", '\n'.join(imported_experiments), sender, email_recipients)
def compare_weeks(filename, title, df_c, df_h):
    """Plot a cold week against a hot week and record boiler statistics.

    Builds a two-row figure (cold week on top, hot week below) showing the
    hot-water outlet temperature, the primary temperature and the weekly
    mean primary temperature, hands it to ``plot``, then appends a short
    text summary to ``./<HMO_num>/additional_data.txt`` and prints it.

    Side effects: adds ``avg`` and ``tag`` columns to both ``df_c`` and
    ``df_h`` in place.

    :param filename: output file name for the figure; characters 2-6 are
        taken as the HMO number, which names the summary directory
    :param title: figure title
    :param df_c: cold-week frame with ``PrimT`` and ``HwTOutlet`` columns;
        its index entries are formatted with ``strftime``
    :param df_h: hot-week frame with the same columns
    """
    HMO_num = filename[2:7]
    print(HMO_num)

    # Broadcast each week's mean primary temperature into a constant column
    # so it can be drawn as a flat reference line.
    df_c['avg'] = df_c["PrimT"].mean()
    df_h['avg'] = df_h["PrimT"].mean()

    start_time = utils.unix_time_millis(df_c.first_valid_index())
    # Both weeks are drawn against the cold week's "Weekday HH:MM" labels.
    times = df_c.index.to_series().apply(
        lambda x: dt.datetime.strftime(x, '%A %H:%M')).tolist()

    fig = tools.make_subplots(rows=2,
                              cols=1,
                              specs=[[{}], [{}]],
                              subplot_titles=('HMO Analysis of Cold Week',
                                              'HMO Analysis of Hot Week'),
                              shared_xaxes=True,
                              shared_yaxes=False,
                              vertical_spacing=0.1)

    # HW Outlet is the temperature at the outlet going to the hot water
    # system; Primary temperature is the temp measured at the top of the
    # burner that heats up the water.
    for row, week, frame in ((1, "Cold", df_c), (2, "Hot", df_h)):
        series = (
            ("HW Outlet", frame["HwTOutlet"], '#1f77b4'),
            ("Primary Temp", frame["PrimT"], '#ff7f0e'),
            ("Boiler Average", frame["avg"], '#2ca02c'),
        )
        for label, values, colour in series:
            trace = go.Scatter(x=times,
                               y=values,
                               name="{} - {} Week".format(label, week),
                               connectgaps=True,
                               line=dict(color=colour))
            fig.append_trace(trace, row, 1)

    fig['layout'].update(title=title,
                         yaxis=dict(title='Temperature'),
                         xaxis=dict(tick0=start_time, nticks=21))
    plot(fig, filename=filename, auto_open=False)

    boiler_temp_threshold = 50
    # The 'tag' column is set before get_longest_hot_boiler is called;
    # presumably that helper reads it - confirm against its definition.
    df_c['tag'] = df_c.PrimT > boiler_temp_threshold
    df_h['tag'] = df_h.PrimT > boiler_temp_threshold

    # NOTE(review): dividing the sample count by 20 presumably converts
    # samples to hours (i.e. 20 samples per hour) - confirm the sample rate.
    time_spent_boiler_c = (df_c.PrimT >= boiler_temp_threshold).values.sum() / 20
    max_time_spent_boiler_c = get_longest_hot_boiler(df_c)
    time_spent_boiler_h = (df_h.PrimT >= boiler_temp_threshold).values.sum() / 20
    max_time_spent_boiler_h = get_longest_hot_boiler(df_h)

    cold_week_text = "During the cold week, the boiler was above 50°C for {} hours, with the longest period being {} " \
                     "hours long, on average it was {}°C ".format(time_spent_boiler_c, max_time_spent_boiler_c,
                                                                  df_c.avg.values[0])
    hot_week_text = "During the hot week, the boiler was above 50°C for {} hours, with the longest period being {} " \
                    "hours long, on average it was {}°C ".format(time_spent_boiler_h, max_time_spent_boiler_h,
                                                                 df_h.avg.values[0])

    # Context manager guarantees the summary file is flushed and closed even
    # if a write fails (the handle was previously never closed).
    with open("./{}/additional_data.txt".format(HMO_num), "a+") as f:
        f.write(cold_week_text + "\r\n")
        f.write(hot_week_text + "\r\n")
    print(cold_week_text)
    print(hot_week_text)
def plot(filename, title, df):
    """Write an offline plotly chart of outlet and primary temperatures.

    The frame is forward-filled, indexed by timestamp, averaged into
    15-minute bins and plotted as two lines (``HwTOutlet`` and ``PrimT``)
    with a range selector and range slider on the date axis. The first 20
    forward-filled rows and the first 9 resampled rows are printed.

    :param filename: output file passed to ``plotly.offline.plot``
    :param title: chart title
    :param df: frame with ``HwTOutlet`` and ``PrimT`` columns and a
        timestamp column named either ``Time`` or ``datetime``
    """
    # ffill() returns a new frame, so the caller's frame is not modified by
    # the column/index assignments below. It replaces the deprecated
    # fillna(method="ffill") spelling with identical results.
    df = df.ffill()
    print(df.head(20))

    # Prefer the 'Time' column when present, otherwise re-parse 'datetime'.
    time_column = 'Time' if 'Time' in df.columns else 'datetime'
    df['datetime'] = pd.to_datetime(df[time_column])
    df.index = df['datetime']
    start_time = utils.unix_time_millis(df.first_valid_index())

    # Averaging into 15-minute bins replaces the earlier 60-sample rolling
    # mean. "15min" is the non-deprecated spelling of the "15T" alias.
    df = df.resample("15min").mean()
    print(df.head(9))

    # HW Outlet is the temperature at the outlet going to the hot water
    # system.
    trace_hwt_outlet = go.Scatter(x=df.index,
                                  y=df.HwTOutlet,
                                  name="HW Outlet",
                                  connectgaps=True)
    # Primary temperature is the temp measured at the top of the burner that
    # heats up the water.
    trace_temp_outlet = go.Scatter(x=df.index,
                                   y=df.PrimT,
                                   name="Primary Temp",
                                   connectgaps=True)

    range_buttons = [
        dict(count=7, label='1w', step='day', stepmode='backward'),
        dict(count=1, label='1m', step='month', stepmode='backward'),
        dict(step='all'),
    ]
    layout = go.Layout(title=title,
                       yaxis=dict(title='Temperature'),
                       xaxis=dict(tick0=start_time,
                                  rangeselector=dict(buttons=range_buttons),
                                  rangeslider=dict(visible=True),
                                  type='date'))

    plotly.offline.plot(
        {
            "data": [trace_hwt_outlet, trace_temp_outlet],
            "layout": layout
        },
        filename=filename,
        auto_open=False)