def _wait_for_machine_configured(self, file_reader):
    """In case of nosql and bigdata, CMT changes the hostname; wait for
    that action to complete."""
    total_sleep_time = 0
    wait_for_conf = False
    for n in self._ctx.node_list:
        machine_type = file_reader.read_attribute(n.ip_address, 'MACHINE_TYPE')
        if machine_type == 'manager':
            wait_for_conf = True
            break
    if wait_for_conf:
        while True:
            if util.get_hostname() != self._ctx.this_node.hostname:
                self._logger.debug("Sleep")
                total_sleep_time += self._ctx.CMT_CONF_WAIT
                if total_sleep_time >= self._ctx.MAX_CMT_CONF_WAIT:
                    util.log_exception("Waiting for machine configuration took too long")
                    self.shutdown()
                time.sleep(self._ctx.CMT_CONF_WAIT)
            else:
                # sleep once more before the exit: to make sure that the
                # hostname change propagated
                time.sleep(self._ctx.CMT_CONF_WAIT)
                break
def extract_xref(files_list):
    # total number of files, to calculate completion percentage
    total_files = len(files_list)
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx + 1, total_files, 50)
    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    '''
    save_obj(bad_files_names, 'bad_asm_files')
    # drop corrupted files (if any) from the training set
    if len(bad_files_names) > 0:
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: '
                     + str(len(bad_files_names)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm\n')
    '''
    # save xref features dataframe to csv file to keep results (optional)
    xref_pd.to_csv('features/xref_features.csv', index=False)
    return xref_pd
def run():
    try:
        init()
        if len(sys.argv) > 1:
            switch(str(sys.argv[1]))
        else:
            run_all()
    except Exception as e:
        log_exception(e)
def _slave_loop(self, a_node_list):
    self._logger.info("Slave Loop start")
    while self._continue:
        try:
            self._sync_collections(a_node_list)
            if self._get_master_count(self._ctx.node_master_timeout) == "TOO_LOW":
                break
            if self._ctx.my_master != self._ctx.master_list[0]:
                self._assign_master(self._ctx.master_list[0])
        except:
            util.log_exception(sys.exc_info())
            self.shutdown()
def read_attribute(self, ip_address, attr_type):
    try:
        attr_pos = self.node_attributes.index(attr_type)
        attr_ip_pos = self.node_attributes.index('IP_ADDRESS')
        with open(self.nodelist_file, 'r') as f:
            for line in f.readlines():
                if len(line) > 1:
                    node_data_list = line.split(None)
                    if ip_address == node_data_list[attr_ip_pos]:
                        return node_data_list[attr_pos]
    except:
        util.log_exception(sys.exc_info())
def get_type_system_diagram(project_id):
    result = {}
    try:
        type_system_diagram = models.get_type_system_diagram(project_id)
        result["resultOK"] = True
        result["result"] = type_system_diagram
    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(e)
        log_exception(e)
    return dumps(result, ensure_ascii=False)
def main():
    train_labels = pd.read_csv(DATASET_DIR + 'trainLabels.csv')
    files_list = train_labels['Id'].tolist()
    # total number of files, to calculate completion percentage
    total_files = len(files_list)
    # do not count corrupted files
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx + 1, total_files, 50)
    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    save_obj(bad_files_names, 'bad_files')
    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_pd], axis=1, sort=False)
    # drop corrupted files (if any) from the training set
    if len(bad_files_idx) > 0:
        data.drop(data.index[bad_files_idx], inplace=True)
        data = data.reset_index(drop=True)
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: '
                     + str(len(bad_files_idx)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm.gz\n')
    # save xref features dataframe to csv file to keep results (optional)
    data.to_csv('results/xref_features.csv')
def _get_master_count(self, heartbeat_periods=1):
    """Listens to master heartbeat signals. Depending on the number of
    received signals, a decision is made on how to proceed:
    - If too few signals arrive, the node attempts to become a master itself.
    - If too many signals arrive and the node is a slave, it checks whether
      it should keep running as a slave.
    """
    self._logger.debug("_get_master_count ENTER")
    ret = "FINE"
    self._ctx.heartbeats_received = 0
    self._ctx.master_list[:] = []
    # Sleep, count masters when awake
    self._logger.debug("_get_master_count sleep")
    time.sleep(heartbeat_periods * self._ctx.heartbeat_period)
    self._logger.debug("_get_master_count awake")
    self._logger.debug("_get_master_count role: " + self._ctx.this_node.role)
    self._ctx.resource_lock.acquire()
    try:
        # A master expects to hear no other master; a slave expects exactly one.
        if self._ctx.this_node.role == "MASTER":
            expected_masters = 0
        else:
            expected_masters = 1
        self._logger.debug("master list length: " + str(len(self._ctx.master_list)))
        self._logger.debug("expected masters: " + str(expected_masters))
        if len(self._ctx.master_list) < expected_masters:
            ret = "TOO_LOW"
        elif len(self._ctx.master_list) > expected_masters:
            ret = "TOO_HIGH"
        else:
            ret = "FINE"
    except:
        util.log_exception(sys.exc_info())
    finally:
        self._ctx.resource_lock.release()
    self._logger.debug("_get_master_count EXIT returning " + str(ret))
    return ret
def get_relationship_type_list(project_id):
    result = {}
    try:
        relationship_type_list = models.get_relationship_type_list(project_id)
        result["resultOK"] = True
        result["list"] = relationship_type_list
    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(e)
        log_exception(e)
    return dumps(result, ensure_ascii=False)
def get_entity_type_list(project_id):
    result = {}
    try:
        entity_type_list = models.get_entity_type_list(project_id)
        result["resultOK"] = True
        result["list"] = entity_type_list
    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(e)
        log_exception(e)
    return dumps(result, ensure_ascii=False)
def _continue_as_master(self):
    """Returns True if a node should continue in master role"""
    ret = True
    try:
        my_pos = self._ctx.node_list.index(self._ctx.this_node)
        for m in self._ctx.master_list:
            master_pos = self._ctx.node_list.index(m)
            if master_pos < my_pos:
                # Another master precedes this node in the node list,
                # so this node yields the master role.
                ret = False
                break
        self._logger.info("Continuing as master: %s" % str(ret))
    except ValueError:
        self._logger.debug("Active node list: %s" % self._ctx.active_node_list)
        self._logger.debug("Master list: %s" % self._ctx.master_list)
        util.log_exception(sys.exc_info())
    return ret
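# A minimal standalone sketch of the election rule used above: the node whose
# entry appears earliest in the shared node list keeps the master role, and
# every other claimant steps down. The names here (should_continue_as_master,
# the string node ids) are illustrative stand-ins, not the module's real types.
def should_continue_as_master(this_node, node_list, master_list):
    """Return True unless another current master precedes us in node_list."""
    my_pos = node_list.index(this_node)
    return all(node_list.index(m) >= my_pos for m in master_list)

# Example: nodes A and B both believe they are master; only A, which comes
# earlier in the node list, keeps the role.
assert should_continue_as_master('A', ['A', 'B', 'C'], ['A', 'B']) is True
assert should_continue_as_master('B', ['A', 'B', 'C'], ['A', 'B']) is False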
def save_all(project_id):
    result = {}
    try:
        type_system_diagram = request.json['typeSystemDiagram']
        entity_types = request.json['entityTypes']
        relation_types = request.json['relationTypes']
        save_result = models.save_all(project_id=project_id,
                                      type_system_diagram=type_system_diagram,
                                      entity_types=entity_types,
                                      relation_types=relation_types)
        result["resultOK"] = True
        result["result"] = save_result
    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(e)
        log_exception(e)
    return dumps(result, ensure_ascii=False)
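# A hedged sketch of the round trip the handlers above implement: the client
# POSTs a JSON body whose 'typeSystemDiagram', 'entityTypes' and
# 'relationTypes' keys are taken directly from save_all itself, and every
# handler answers with the same envelope: resultOK plus either a payload or a
# message. The field contents below are invented for illustration.
import json

payload = {
    "typeSystemDiagram": {"nodes": [], "edges": []},
    "entityTypes": [{"name": "Person"}],
    "relationTypes": [{"name": "knows"}],
}
body = json.dumps(payload)

# Expected success envelope:  {"resultOK": true,  "result": ...}
# Expected failure envelope:  {"resultOK": false, "message": "..."}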
def parse_bytes(file_name, addrlength=32):
    bytes_seq = bytearray()
    try:
        with gzip.open(file_name, 'rt') as fp:
            for line in fp.readlines():
                if not line.strip():
                    continue
                # Strip the leading memory address: an addrlength-bit address
                # occupies addrlength/4 hex characters.
                mem_addr = int(addrlength / 4)
                line = line[mem_addr:].strip()
                line = line.replace('?', '')  # ignore '?' characters
                # store as bytearray for efficient memory management
                bytes_seq = bytes_seq + bytearray.fromhex(line)
    except Exception as e:
        print(e)
        log_exception(e, sys.argv[0], file_name)
        bytes_seq = None
    return bytes_seq
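# A minimal usage sketch for parse_bytes, assuming the ".bytes" layout the
# function expects: each line is a hex memory address followed by
# space-separated hex byte pairs, with '??' marking unknown bytes. The file
# name and contents are invented for illustration.
import gzip

with gzip.open('example.bytes.gz', 'wt') as fp:
    fp.write('00401000 4D 5A 90 00 ?? 00\n')

seq = parse_bytes('example.bytes.gz')  # 32-bit addresses by default
print(seq)                             # bytearray(b'MZ\x90\x00\x00')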
def run():
    if len(sys.argv) < 2:
        exit()
    setGlobals()
    session_count = 0
    with open(DATA_URLS) as json_file:
        data = json.load(json_file)
        for d in data['urls']:
            if session_count >= int(sys.argv[1]):
                break
            try:
                if not is_already_done(d['url']) and not is_paused(d['url']):
                    check_price(d['url'], d['thresh'])
                    session_count += 1
                    time.sleep(random.randint(30, 90))  # wait 30-90 seconds
            except Exception:
                log_exception('Exception for url "' + str(d['url']) + '":\r\n'
                              + str(traceback.format_exc()))
                pause_execution(d['url'])  # Skip this url for some time
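# A hedged sketch of the structure run() expects in the DATA_URLS file: a
# top-level 'urls' array whose entries carry a 'url' and a price threshold
# 'thresh' (both key names come straight from the loop above; the values are
# invented for illustration).
example_data = {
    "urls": [
        {"url": "https://example.com/product/123", "thresh": 49.99},
        {"url": "https://example.com/product/456", "thresh": 15.00},
    ]
}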
def _sync_collections(self, a_node_list):
    """Read a_node_list, and update active_node_list and dead_node_list,
    if needed"""
    active_nodes_changed = False
    try:
        a_node_list[:] = self._ctx.nodelist_reader.read_node_list(
            self._ctx.this_node, self._ctx.mode)
        # Check if the cluster scaled out, or was just created.
        # Fetch new nodes and add them to active_node_list.
        nodes = [n for n in a_node_list
                 if n not in self._ctx.active_node_list
                 and n.ip_address not in self._ctx.dead_node_set]
        for m in nodes:
            self._ctx.active_node_list.append(m)
        if nodes:
            active_nodes_changed = True
        # Check if the cluster scaled in.
        # Remove a node from active_node_list if it is no longer present
        # in the cluster.
        nodes = [n for n in self._ctx.active_node_list if n not in a_node_list]
        for m in nodes:
            self._ctx.active_node_list.remove(m)
        if nodes:
            active_nodes_changed = True
        # Remove a node from dead_node_set if it is no longer present
        # in the cluster.
        nodes = [ip for ip in self._ctx.dead_node_set
                 if not util.find_node_by_ip(ip, a_node_list)]
        for m in nodes:
            self._ctx.dead_node_set.remove(m)
    except ValueError:
        util.log_exception(sys.exc_info())
    return active_nodes_changed, a_node_list
def handle(self):
    self.server.ctx.resource_lock.acquire()
    self.server.logger.debug("Received Data")
    try:
        data = self.request[0].strip()
        json_object = json.loads(data)
        if json_object[0] == "node":
            # Received heartbeat signal
            self.handle_heartbeat(json_object)
        elif (json_object[0] == "active_node_list"
              and self.server.ctx.this_node.role == "SLAVE"):
            # Received active_node_list. Should be sent only to slaves.
            self.handle_list(json_object)
        else:
            self.server.logger.warn("Received unexpected data")
    except (TypeError, RuntimeError):
        util.log_exception(sys.exc_info())
    finally:
        self.server.ctx.resource_lock.release()
def read_node_list(self, my_node, mode):
    node_list = []
    try:
        with open(self.nodelist_file, 'r') as f:
            i = 0
            for line in f.readlines():
                if len(line) > 1:
                    node_data_list = line.split(None)
                    if mode == "TEST":
                        my_node.port = 11911 + i
                        i += 1
                    node_list.append(node.Node(
                        port=my_node.port,
                        ip_address=node_data_list[0],
                        hostname=node_data_list[2],
                        machine_type=node_data_list[3],
                        ip_address_public=node_data_list[1],
                        instance_id=my_node.instance_id,
                        cluster_id=my_node.cluster_id,
                        machine_id=node_data_list[4],
                        cloud_zone=my_node.cloud_zone))
    except:
        util.log_exception(sys.exc_info())
    return node_list
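# A hedged sketch of the whitespace-separated nodelist file that both
# read_attribute and read_node_list parse. The column order is inferred from
# the indices used in read_node_list (0: private IP, 1: public IP,
# 2: hostname, 3: machine type, 4: machine id); the values are invented.
example_nodelist = """\
10.0.0.1  203.0.113.1  node-a  manager  m-001
10.0.0.2  203.0.113.2  node-b  worker   m-002
"""

with open('nodelist.example', 'w') as f:
    f.write(example_nodelist)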
def _dead_node_scan(self):
    self.logger.debug('ENTER dead node scan')
    self.resurrected_node_list = []
    self._remove_expired_timers()
    try:
        # Go through the nodes and check if some node's state changed.
        # Add dead and reborn nodes to the appropriate lists.
        for n in self._ctx.node_list:
            if self._ctx.this_node.ip_address == n.ip_address:
                continue
            path = os.path.join(self._ctx.conf.hm_root,
                                self._ctx.conf.collectd_home,
                                self._ctx.conf.collectd_rrd_dir,
                                n.hostname)
            known_as_dead = (n.ip_address in self._ctx.dead_node_set
                             or n.ip_address in self._ctx.new_dead_node_set)
            node_state = self._node_state(path,
                                          time.mktime(time.localtime()),
                                          known_as_dead)
            self.logger.debug("node_state, ip_address: %s, %s"
                              % (node_state, n.ip_address))
            if node_state == "CHANGED_TO_DEAD":
                if n.ip_address not in self._ctx.new_dead_node_set:
                    self.logger.debug("Starting timed verification of new dead node")
                    self._ctx.new_dead_node_set.add(n.ip_address)
                    ncv = self._start_node_creation_verifier(n)
                    self._node_creation_verifier_list.append(ncv)
            if node_state == "CHANGED_TO_ALIVE":
                # Found a resurrected node; update the collections.
                if self._process_node_resurrection(n):
                    self.resurrected_node_list.append(n)
        # After checking each node's state, process the lists if needed.
        if self.resurrected_node_list:
            self._process_active_node_list_change()
            self._ctx.ntf_manager.process_node_status_alerts(
                self.resurrected_node_list, "RESURRECTED_NODE")
    except:
        util.log_exception(sys.exc_info())
def statistics(log, filename2cghubRecords, minmaxsize, verbose):
    states = {}
    centers = {}
    studies = {}
    sampleIDs = [{}, {}, {}, {}, {}, {}, {}, {}]
    diseases = {}
    analyte_codes = {}
    sample_types = {}
    strategies = {}
    platforms = {}
    refassems = {}
    models = {}
    for record in filename2cghubRecords.itervalues():
        states[record.state] = states.setdefault(record.state, 0) + 1
        centers[record.center_name] = centers.setdefault(record.center_name, 0) + 1
        studies[record.study] = studies.setdefault(record.study, 0) + 1
        diseases[record.disease_abbr] = diseases.setdefault(record.disease_abbr, 0) + 1
        analyte_codes[record.analyte_code] = analyte_codes.setdefault(record.analyte_code, 0) + 1
        sample_types[record.sample_type] = sample_types.setdefault(record.sample_type, 0) + 1
        strategies[record.library_strategy] = strategies.setdefault(record.library_strategy, 0) + 1
        platforms[record.platform] = platforms.setdefault(record.platform, 0) + 1
        refassems[record.refassem_short_name] = refassems.setdefault(record.refassem_short_name, 0) + 1
        models[record.platform_full_name] = models.setdefault(record.platform_full_name, 0) + 1
        try:
            fields = record.legacy_sample_id.split('-')
            for index, field in enumerate(fields[:-3]):
                sampleIDs[index][field] = sampleIDs[index].setdefault(field, 0) + 1
            for index, field in enumerate(fields[-3:]):
                pos = index + len(fields[:-3])
                sampleIDs[pos][field] = sampleIDs[pos].setdefault(field, 0) + 1
        except:
            util.log_exception(log, 'problem splitting %s(%s:%s)'
                               % (record.legacy_sample_id, index, field))

    def log_counts(title, counts):
        """Log up to 15 entries of a count map, then the total key count."""
        util.log_info(log, title)
        count = 0
        for key, value in counts.iteritems():
            if count < 15:
                count += 1
                util.log_info(log, '%s: %s' % (key, value))
            else:
                util.log_info(log, '(of %s)' % (len(counts.keys())))
                break
        util.log_info(log, '')

    log_counts('\nStates', states)
    log_counts('Centers', centers)
    log_counts('Studies', studies)
    if verbose:
        util.log_info(log, 'Sample ids:')
        for sampleMap in sampleIDs:
            log_counts('next part:', sampleMap)
    util.log_info(log, 'Diseases:')
    for disease, value in diseases.iteritems():
        util.log_info(log, '%s: %s' % (disease, value))
    util.log_info(log, '')
    log_counts('Analyte codes:', analyte_codes)
    log_counts('Sample types', sample_types)
    log_counts('Strategies:', strategies)
    log_counts('Platforms:', platforms)
    log_counts('Reference Assemblies:', refassems)
    log_counts('Models:', models)
    util.log_info(log, '\n\t\tmax: %s\n\t\tmin: %s'
                  % (minmaxsize['max'].write(), minmaxsize['min'].write()))
def addPlatformPipelineFields(config, metadata, seen_bad_codes, log):
    fullPlatform2platform = config['uploadfullplatform2platform']
    analyte2strategy2moltype = config['analyte2strategy2moltype']
    shortname2centername = config['shortname2centername']
    try:
        platformName = fullPlatform2platform[metadata['platform_full_name']]
    except KeyError, e:
        print "metadata['platform_full_name']:"
        pprint.pprint(metadata['platform_full_name'])
        log_exception(log, 'KeyError in fullPlatform2platform[metadata["platform_full_name"]]: %s' % str(e))
    centerName = shortname2centername[metadata['DataCenterCode']]
    assembly = metadata['GenomeReference']
    try:
        moltype = analyte2strategy2moltype[metadata['analyte_code']][metadata['library_strategy']]
    except Exception as e:
        log_exception(log, 'problem setting molecular type: \'%s\' \'%s\''
                      % (metadata['analyte_code'], metadata['library_strategy']))
        raise e
    metadata['Platform'] = platformName + '_' + moltype
    metadata['Pipeline'] = centerName + '__' + moltype
    analyte2center_type = config['analyte2center_type']
    if metadata['analyte_code'] != '':
        metadata['DataCenterType'] = analyte2center_type[metadata['analyte_code']]
    shortname2centercodes = config['shortname2centercodes']
    shortname = metadata['DataCenterCode']
    if metadata['analyte_code'] in ('H', 'R', 'T'):
        metadata['DataCenterCode'] = shortname2centercodes[metadata['DataCenterCode']][1]
    elif metadata['analyte_code'] in ('D', 'W', 'X'):
        metadata['DataCenterCode'] = shortname2centercodes[metadata['DataCenterCode']][0]
    else:
        pass
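# A worked example of the naming convention above, with invented lookup
# results: Platform joins its parts with a single underscore, Pipeline with a
# double underscore.
platformName = 'illumina'  # hypothetical fullPlatform2platform result
centerName = 'bcgsc.ca'    # hypothetical shortname2centername result
moltype = 'DNASeq'         # hypothetical analyte2strategy2moltype result
assert platformName + '_' + moltype == 'illumina_DNASeq'
assert centerName + '__' + moltype == 'bcgsc.ca__DNASeq'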
def main(platform, type_uri='detail', log=None, removedups=False, limit=-1,
         verbose=False, print_response=False):
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    minmaxsize = {'min': CGHubFileInfo('', 500000000000, ''),
                  'max': CGHubFileInfo('', 1, '')}
    try:
        response = urllib.urlopen(manifest_uri)
        archives = response.read()
        lines = archives.split('\n')
        util.log_info(log, '\tarchive size is %s with %s lines'
                      % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        util.log_exception(log, 'problem fetching latest_manifest: %s' % str(e))
        raise e
    # Map manifest column positions to the record fields of interest.
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header
    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[column_index2header[index]]] = fields[index]
        except Exception as e:
            util.log_info(log, 'problem with parsing line(%s): %s' % (count, line))
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        # Track the smallest and largest live BAM files seen so far.
        if 'Live' == record.state:
            if minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(log, 'no file size: %s--%s'
                              % (record.write(), record.files['bam'].write()))
    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(log, 'finished reading cghub archive. %s total records, %s duplicates'
                  % (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize, archives
def byte_ngram(files_list, addrlength=32, n=1):
    dicts_list = []
    total_files = len(files_list)
    bad_files_names = []
    for idx, file_name in enumerate(files_list):
        bytes_file = DATASET_DIR + file_name + '.bytes.gz'
        try:
            with gzip.open(bytes_file, 'rt') as fp:
                bytedict = {}
                hex_seq = ""
                for line in fp.readlines():
                    if not line.strip():
                        continue
                    # hex to bytes: skip the leading memory address so that
                    # address values are not counted in the ngram calculation
                    address = int(addrlength / 4)
                    hex_seq = hex_seq + line[address:].strip()
                hex_seq = hex_seq.replace(" ", "")
                for i in range(0, len(hex_seq) - 1, 2):
                    # ignore bytes that contain the "?" character
                    if hex_seq[i] == "?" or hex_seq[i + 1] == "?":
                        continue
                    if 2 * n + i > len(hex_seq):
                        break
                    gram = hex_seq[i:(2 * n + i)]
                    if gram not in bytedict.keys():
                        bytedict[gram] = 1
                    else:
                        bytedict[gram] += 1
                dicts_list.append(bytedict)
        except Exception as e:
            bad_files_names.append(file_name)
            log_exception(e, sys.argv[0], bytes_file)
        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)
    # log the corrupted files for future reference
    if len(bad_files_names) > 0:
        with open('bad_bytes_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.bytes\n')
    # convert list of dictionaries to a byte ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    # store frequency of each byte ngram
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv')
    save_obj(ngram_freq_df, str(n) + 'gram_byte_freq')
    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # store tfidf of each byte ngram
    ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(), columns=vec.get_feature_names())
    ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv')
    save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf')
    return ngram_tfidf_df
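# A small self-contained check of the counting logic above: slide a window of
# n bytes (2*n hex characters) one byte at a time over the concatenated hex
# string and tally each gram. The helper name and the hex string are invented
# for illustration; the stepping and break conditions mirror byte_ngram.
def count_byte_ngrams(hex_seq, n=2):
    counts = {}
    for i in range(0, len(hex_seq) - 1, 2):
        if '?' in hex_seq[i:i + 2]:
            continue
        if 2 * n + i > len(hex_seq):
            break
        gram = hex_seq[i:2 * n + i]
        counts[gram] = counts.get(gram, 0) + 1
    return counts

print(count_byte_ngrams('4D5A4D5A', n=2))  # {'4D5A': 2, '5A4D': 1}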
def _set_master(self):
    if not self._ctx.node_list:
        util.log_exception(sys.exc_info(), "Unable to set a master for the node")
        self.shutdown()
    self._assign_master(self._ctx.node_list[0])
def check_node_still_dead(self, node):
    self.logger.debug('ENTER check_node_still_dead')
    path = os.path.join(self._ctx.conf.hm_root,
                        self._ctx.conf.collectd_home,
                        self._ctx.conf.collectd_rrd_dir,
                        node.hostname)
    self._ctx.resource_lock.acquire()
    try:
        self._ctx.min_time_diff = -1
        self.logger.debug("path: " + path)
        os.path.walk(path, self.find_minimal_rrd_timestamp,
                     [time.mktime(time.localtime())])
        diff = self._ctx.min_time_diff
        self.logger.debug("diff: " + str(diff))
        self.logger.debug("timeout: " + str(self._ctx.dead_node_timeout))
        if 0 < diff < self._ctx.dead_node_timeout:
            # The node's rrd files were updated recently, so it is alive.
            self.logger.debug('check_node_still_dead() node is alive')
        else:
            self.logger.debug('check_node_still_dead() node is dead: ' + node.ip_address)
            if node in self._ctx.active_node_list:
                self._ctx.active_node_list.remove(node)
            self._ctx.dead_node_set.add(node.ip_address)
            util.send(self._ctx.this_node, self._ctx.node_list,
                      util.json_from_list(self._ctx.active_node_list, 'active_node_list'))
            self.logger.debug("process node status alerts...")
            self._ctx.ntf_manager.process_node_status_alerts([node], "DEAD_NODE")
            self.logger.debug("storing list to file...")
            util.store_list_to_file(self._ctx.active_node_list,
                                    self._ctx.active_node_list_file,
                                    self._ctx.this_node.group_name)
    except:
        util.log_exception(sys.exc_info())
    finally:
        self._ctx.new_dead_node_set.remove(node.ip_address)
        self._ctx.resource_lock.release()
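# A minimal standalone sketch of the liveness test used above: a node counts
# as alive if any of its collectd rrd files was modified within the timeout
# window. os.walk stands in here for the Python 2 os.path.walk callback the
# original uses; the function names and the default timeout are illustrative.
import os
import time

def min_rrd_age(rrd_dir):
    """Return the age in seconds of the most recently updated file, or -1."""
    now = time.time()
    ages = [now - os.path.getmtime(os.path.join(root, name))
            for root, _, names in os.walk(rrd_dir)
            for name in names]
    return min(ages) if ages else -1

def node_is_alive(rrd_dir, dead_node_timeout=300):
    age = min_rrd_age(rrd_dir)
    return 0 < age < dead_node_timeout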