def _wait_for_machine_configured(self, file_reader):
        """In case of nosql and bigdata CMT is changing hostname, wait for that
           action being complete"""

        total_sleep_time = 0
        wait_for_conf = False
        for n in self._ctx.node_list:
            machine_type = file_reader.read_attribute(n.ip_address, 'MACHINE_TYPE')
            if machine_type == 'manager':
                wait_for_conf = True
                break
        if wait_for_conf:
            while True:
                if util.get_hostname() != self._ctx.this_node.hostname:
                    self._logger.debug("Sleep")
                    total_sleep_time += self._ctx.CMT_CONF_WAIT
                    if total_sleep_time >= self._ctx.MAX_CMT_CONF_WAIT:
                        util.log_exception("Waiting for machine configurtion took too long")
                        self.shutdown()
                    time.sleep(self._ctx.CMT_CONF_WAIT)
                else:
                    # sleep once more before exiting, to make sure the hostname
                    # change has propagated
                    time.sleep(self._ctx.CMT_CONF_WAIT)
                    break
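Most examples on this page call a project-specific log_exception helper, sometimes with a plain message (as above) and sometimes with the tuple returned by sys.exc_info(). The snippet below is only a minimal sketch of such a helper built on the standard logging and traceback modules; the real implementations in these projects may take different arguments.

import logging
import traceback

logger = logging.getLogger("examples")

def log_exception(info, message=None):
    # Accept either a plain message string or a sys.exc_info() tuple (hypothetical signature).
    if isinstance(info, tuple) and len(info) == 3:
        text = "".join(traceback.format_exception(*info))
        logger.error("%s%s", (message + ": ") if message else "", text)
    else:
        logger.error("%s", info)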
Example #2
def extract_xref(files_list):
    # total number of files to calculate completion percentage
    total_files = len(files_list)
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)
        progress_bar(idx+1, total_files, 50)

    xref_pd = pd.DataFrame.from_dict(xref_dict)
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    '''
    save_obj(bad_files_names, 'bad_asm_files')
    # drop corrupted files (if any) from the training set
    if len(bad_files_names) > 0:
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: ' + 
                str(len(bad_files_names)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm')
    '''
    # save xref features dataframe to csv file to keep results (optional)
    xref_pd.to_csv('features/xref_features.csv', index=False)
    return xref_pd
Example #3
def run():
    try:
        init()
        if len(sys.argv) > 1:
            switch(str(sys.argv[1]))
        else:
            run_all()
    except Exception as e:
        log_exception(e)
def addPlatformPipelineFields(config, metadata, seen_bad_codes, log):
    fullPlatform2platform = config['uploadfullplatform2platform']
    analyte2strategy2moltype = config['analyte2strategy2moltype']
    shortname2centername = config['shortname2centername']
    try:
        platformName = fullPlatform2platform[metadata['platform_full_name']]
    except KeyError, e:
        print "metadata['platform_full_name']"
        pprint.pprint(metadata['platform_full_name'])
        log_exception(log, 'KeyError in fullPlatform2platform[metadata["platform_full_name"]]: %s' % str(e))
def _slave_loop(self, a_node_list):
    self._logger.info("Slave Loop start")
    while self._continue:
        try:
            self._sync_collections(a_node_list)
            if self._get_master_count(self._ctx.node_master_timeout) == "TOO_LOW":
                break
            if self._ctx.my_master != self._ctx.master_list[0]:
                self._assign_master(self._ctx.master_list[0])
        except:
            util.log_exception(sys.exc_info())
            self.shutdown()
def read_attribute(self, ip_address, attr_type):
    try:
        attr_pos = self.node_attributes.index(attr_type)
        attr_ip_pos = self.node_attributes.index('IP_ADDRESS')
        with open(self.nodelist_file, 'r') as f:
            for line in f.readlines():
                if len(line) > 1:
                    node_data_list = line.split(None)
                    if ip_address == node_data_list[attr_ip_pos]:
                        return node_data_list[attr_pos]
    except:
        util.log_exception(sys.exc_info())
Example #7
def get_type_system_diagram(project_id):
    result = {}
    try:
        type_system_diagram = models.get_type_system_diagram(project_id)
        result["resultOK"] = True
        result["result"] = type_system_diagram
    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(Exception)
        log_exception(e)

    return dumps(result, ensure_ascii=False)
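The web-handler examples on this page (this one and Examples #11, #12 and #14 below) share the same response envelope and route the exception to log_exception instead of propagating it. Purely as an illustration, the returned JSON has one of these two shapes:

# Illustrative only; the actual payloads come from the models module.
success_response = {"resultOK": True, "result": {"...": "..."}}
failure_response = {"resultOK": False, "message": "error description"}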
Example #8
def addPlatformPipelineFields(config, metadata, seen_bad_codes, log):
    fullPlatform2platform = config['uploadfullplatform2platform']
    analyte2strategy2moltype = config['analyte2strategy2moltype']
    shortname2centername = config['shortname2centername']
    try:
        platformName = fullPlatform2platform[metadata['platform_full_name']]
    except KeyError, e:
        print "metadata['platform_full_name']"
        pprint.pprint(metadata['platform_full_name'])
        log_exception(
            log,
            'KeyError in fullPlatform2platform[metadata["platform_full_name"]]: %s'
            % str(e))
Example #9
def main():
    train_labels = pd.read_csv(DATASET_DIR + 'trainLabels.csv')
    files_list = train_labels['Id'].tolist()
    
    # total number of files to calculate completion percentage
    total_files = len(files_list)

    # do not count corrupted files
    bad_files_idx = []
    bad_files_names = []
    # Extract all features related to DATA and CODE XREF
    xref_dict = xref_initialization()
    for idx, file_name in enumerate(files_list):
        asm_file = DATASET_DIR + 'train/' + file_name + '.asm.gz'
        try:
            get_xref_features(asm_file, xref_dict)
        except Exception as e:
            # log corrupted files for future correction
            log_exception(e, sys.argv[0], asm_file)
            bad_files_idx.append(idx)
            bad_files_names.append(file_name)

        progress_bar(idx+1, total_files, 50)

    xref_pd = pd.DataFrame.from_dict(xref_dict)
    
    # store xref features to avoid recalculation
    save_obj(xref_pd, 'xref_features')
    save_obj(bad_files_names, 'bad_files')

    # concat features with classes and IDs to create the dataset
    data = pd.concat([train_labels, xref_pd], axis=1, sort=False)

    # drop corrupted files (if any) from the training set
    if len(bad_files_idx) > 0:
        data.drop(data.index[bad_files_idx], inplace=True)
        data = data.reset_index(drop=True)
        # log the number of corrupted files
        logging.info('XREF Feature Extraction completed: ' + 
                str(len(bad_files_idx)) + ' file(s) are corrupted.')
        # store the corrupted files names in 'bad_asm_files.txt'
        with open('bad_asm_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.asm.gz\n')

    # save xref features dataframe to csv file to keep results (optional)
    data.to_csv('results/xref_features.csv')
    
    def _get_master_count(self, heartbeat_periods=1):
        """Listens to master heartbeat signals.
        Depending on of number of received signals, a decision is made on
        how to proceed:
            - In case of too small number of signals, the node attempts to be
            itself a master.
            - In case of too big number of signals, if the node is a slave, it
            checks if it should itself run as a slave.
        """
        self._logger.debug("_get_master_count ENTER")

        ret = "FINE"
        self._ctx.heartbeats_received = 0
        self._ctx.master_list[:] = []

        # Sleep, count masters when awake
        self._logger.debug("_get_master_count sleep")
        time.sleep(heartbeat_periods * self._ctx.heartbeat_period)
        print("_get_master_count awake")
        self._logger.debug("_get_master_count role: " + self._ctx.this_node.role)

        self._ctx.resource_lock.acquire()
        try:
            if self._ctx.this_node.role == "MASTER":
                expected_masters = 0
            else:
                expected_masters = 1

            self._logger.debug("master list length:" + str(len(self._ctx.master_list)))
            self._logger.debug(" expected masters" + str(expected_masters))
            if len(self._ctx.master_list) < expected_masters:
                ret = "TOO_LOW"
            elif len(self._ctx.master_list) > expected_masters:
                ret = "TOO_HIGH"
            else:
                ret = "FINE"

                # if self._ctx.this_node.role == "SLAVE" and self._ctx.master_list:
                #    if self._ctx.my_master not in self._ctx.master_list:
                #        self.assign_master(self._ctx.master_list[0])

        except:
            # print("_get_master_count exception: " + sys.exc_info())
            self._logger.debug("STRANGE")
            util.log_exception(sys.exc_info())
        finally:
            self._ctx.resource_lock.release()
        self._logger.debug("_get_master_count EXIT returning " + str(ret))
        return ret
Example #11
def get_relationship_type_list(project_id):
    result = {}
    relationship_type_list = None

    try:
        result = {}
        relationship_type_list = models.get_relationship_type_list(project_id)
        result["resultOK"] = True
        result["list"] = relationship_type_list

    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(Exception)
        log_exception(e)

    return dumps(result, ensure_ascii=False)
Example #12
def get_entity_type_list(project_id):
    result = {}
    entity_type_list = None

    try:
        #project_id = str(request.json['project_id'])
        result = {}
        entity_type_list = models.get_entity_type_list(project_id)
        result["resultOK"] = True
        result["list"] = entity_type_list

    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(Exception)
        log_exception(e)

    return dumps(result, ensure_ascii=False)
def _continue_as_master(self):
    """Returns True if a node should continue in master role"""
    try:
        ret = True
        my_pos = self._ctx.node_list.index(self._ctx.this_node)
        for m in self._ctx.master_list:
            master_pos = self._ctx.node_list.index(m)
            if master_pos < my_pos:
                ret = False
                break
        self._logger.info("Continuing as master: %s" % str(ret))
    except ValueError:
        self._logger.debug("Active node list: %s" % self._ctx.active_node_list)
        self._logger.debug("Master list: %s" % self._ctx.master_list)
        self._logger.debug("Master: %s" % m)
        util.log_exception(sys.exc_info())
    return ret
Example #14
def save_all(project_id):
    result = {}

    try:
        type_system_diagram = request.json['typeSystemDiagram']
        entity_types = request.json['entityTypes']
        relation_types = request.json['relationTypes']

        save_result = models.save_all(project_id=project_id, type_system_diagram=type_system_diagram, entity_types=entity_types, relation_types=relation_types)
        result["resultOK"] = True
        result["result"] = save_result

    except Exception as e:
        result["resultOK"] = False
        result["message"] = str(Exception)
        log_exception(e)

    return dumps(result, ensure_ascii=False)
def parse_bytes(file_name, addrlength=32):
    bytes_seq = bytearray()
    try:
        with gzip.open(file_name, 'rt') as fp:
            for line in fp.readlines():
                if not line.strip():
                    continue
                else:
                    mem_addr = int(addrlength/4)
                    line = line[mem_addr:].strip()
                    line = line.replace('?', '')  # ignore '?' characters
                    # store as bytearray for efficient memory management
                    bytes_seq = bytes_seq + bytearray.fromhex(line)
    except Exception as e:
        print(e)
        log_exception(e, sys.argv[0], file_name)
        bytes_seq = None
    return bytes_seq
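A short usage sketch for parse_bytes with a hypothetical file path; on failure the function logs the exception and returns None, so callers should check for that:

seq = parse_bytes('train/sample01.bytes.gz', addrlength=32)  # hypothetical path
if seq is None:
    print('file could not be parsed; see the exception log')
else:
    print('parsed %d bytes' % len(seq))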
Example #16
def run():
    if len(sys.argv) < 2:
        exit()

    setGlobals()
    session_count = 0

    with open(DATA_URLS) as json_file:
        data = json.load(json_file)
        for d in data['urls']:
            if session_count >= int(sys.argv[1]):
                break
            try:
                if not is_already_done(d['url']) and not is_paused(d['url']):
                    check_price(d['url'], d['thresh'])
                    session_count += 1
                    time.sleep(random.randint(30, 90))  # wait 30-90 seconds
            except Exception:
                log_exception('Exception for url "' + str(d['url']) +
                              '":\r\n' + str(traceback.format_exc()))
                pause_execution(d['url'])  # Skip this url for some time
    def _sync_collections(self, a_node_list):
        """Read a_node_list, and update active_node_list and dead_node_list,
        if needed"""

        active_nodes_changed = False
        try:
            a_node_list[:] = self._ctx.nodelist_reader.read_node_list(self._ctx.this_node, self._ctx.mode)

            # Check if cluster scaled out, or just created
            # Fetch new nodes and add them to active_node_list
            nodes = [n for n in a_node_list if n not in self._ctx.active_node_list and
                     n.ip_address not in self._ctx.dead_node_set]
            for m in nodes:
                self._ctx.active_node_list.append(m)
            if nodes:
                active_nodes_changed = True
            else:
                active_nodes_changed = False

            # Check if cluster scaled in
            # Remove node from active_node_list, if the node is not present any more
            # in the cluster
            nodes = [n for n in self._ctx.active_node_list if n not in a_node_list]
            for m in nodes:
                self._ctx.active_node_list.remove(m)
            if nodes:
                active_nodes_changed = True

            # Remove a node from dead_node_set if the node is no longer present
            # in the cluster
            nodes = [ip for ip in self._ctx.dead_node_set
                     if not util.find_node_by_ip(ip, a_node_list)]
            for m in nodes:
                self._ctx.dead_node_set.remove(m)

        except ValueError:
            self._logger.debug('2')
            util.log_exception(sys.exc_info())

        return active_nodes_changed, a_node_list
    def handle(self):
        self.server.ctx.resource_lock.acquire()
        self.server.logger.debug("Received Data")
        try:
            data = self.request[0].strip()
            json_object = json.loads(data)

            # Received heartbeat signal
            if json_object[0] == "node":
                self.handle_heartbeat(json_object)

            # Received active_node_list. Should be sent only to slaves
            elif json_object[0] == "active_node_list" and self.server.ctx.this_node.role == "SLAVE":
                print("recv act nl")
                self.handle_list(json_object)

            else:
                print("unexp")
                self.server.logger.warn("Received unexpected data")
        except (TypeError, RuntimeError):
            util.log_exception(sys.exc_info())
        finally:
            self.server.ctx.resource_lock.release()
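For reference, handle() above dispatches on the first element of the received JSON array. Only the "node" and "active_node_list" tags are taken from the code; the payload shapes below are assumptions for illustration.

import json

heartbeat_msg = json.dumps(["node", {"ip_address": "10.0.0.11"}])             # payload shape assumed
node_list_msg = json.dumps(["active_node_list", ["10.0.0.11", "10.0.0.12"]])  # payload shape assumed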
def read_node_list(self, my_node, mode):
    node_list = []
    try:
        with open(self.nodelist_file, 'r') as f:
            i = 0
            for line in f.readlines():
                if len(line) > 1:
                    node_data_list = line.split(None)
                    if mode == "TEST":
                        my_node.port = 11911 + i
                    i += 1
                    node_list.append(node.Node(
                        port=my_node.port,
                        ip_address=node_data_list[0],
                        hostname=node_data_list[2],
                        machine_type=node_data_list[3],
                        ip_address_public=node_data_list[1],
                        instance_id=my_node.instance_id,
                        cluster_id=my_node.cluster_id,
                        machine_id=node_data_list[4],
                        cloud_zone=my_node.cloud_zone))
    except:
        util.log_exception(sys.exc_info())
    return node_list
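read_node_list() and read_attribute() earlier on this page both split each non-empty line of nodelist_file on whitespace; judging from the indices used above, the column order is private IP, public IP, hostname, machine type and machine id. A hypothetical line and its unpacking:

example_line = "10.0.0.11 203.0.113.7 node-01 manager i-0abc123\n"  # hypothetical values
ip, public_ip, hostname, machine_type, machine_id = example_line.split(None)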
Example #20
    def _dead_node_scan(self):
        print('ENTER DEAD NODE SCAN**************************************')
        self.logger.debug('***********************ENTER DEAD node scan')
        # dead_node_list = []
        self.resurrected_node_list = []

        self._remove_expired_timers()
        # self._ctx.resource_lock.acquire()
        try:

            # Go through nodes, and check if some node's state changed.
            # Add dead and reborn nodes to appropriate lists
            for n in self._ctx.node_list:
                if self._ctx.this_node.ip_address == n.ip_address:
                    print("---0---")
                    continue
                path = os.path.join(self._ctx.conf.hm_root, self._ctx.conf.collectd_home,
                                    self._ctx.conf.collectd_rrd_dir, n.hostname)
                node_state = "NOT_CHANGED"

                known_as_dead = n.ip_address in self._ctx.dead_node_set or n.ip_address in self._ctx.new_dead_node_set
                for aaaa in self._ctx.dead_node_set:
                    self.logger.debug("dead node node:" + str(aaaa))

                for bbbb in self._ctx.new_dead_node_set:
                    self.logger.debug("new dead node node:" + str(bbbb))


                #print("n.ip_address, self._ctx.dead_node_set:" + n.ip_address self._ctx.dead_node_set)
                node_state = self._node_state(path, time.mktime(time.localtime()), known_as_dead)
                self.logger.debug(
                    "******************************node_state, n.ip_address, self._ctx.dead_node_set:" + node_state + ',' + n.ip_address)

                if node_state == "CHANGED_TO_DEAD":
                    print("---3---")
                    #logger.debug("new_dead_node_set:: %s" % self._ctx.new_dead_node_set)
                    if n.ip_address not in self._ctx.new_dead_node_set:
                        print("---4---")
                        self.logger.debug("Starting timed verification new dead node")
                        self._ctx.new_dead_node_set.add(n.ip_address)
                        ncv = self._start_node_creation_verifier(n)
                        self._node_creation_verifier_list.append(ncv)

                if node_state == "CHANGED_TO_ALIVE":
                    print("---5---")
                    # logger.info("Found resurrected node, updating collections")
                    if self._process_node_resurrection(n):
                        print("---6---")
                        self.resurrected_node_list.append(n)

            # After checking node's state, process lists if needed                
            #if dead_node_list:
            #   print("---7---")
            #   self._process_active_node_list_change()
            #   self._ctx.ntf_manager.process_node_status_alerts(
            #           dead_node_list, "DEAD_NODE")

            if self.resurrected_node_list:
                print("---8---")
                self._process_active_node_list_change()
                self._ctx.ntf_manager.process_node_status_alerts(
                    self.resurrected_node_list, "RESURRECTED_NODE")

        except:
            # _do_shutdown(sys.exc_info())
            self.logger.debug("Strang 1")
            util.log_exception(sys.exc_info())
            #pass
            #print("EXCEPTION in dead_node_scan()" + str(sys.exc_info()))

        finally:
            pass
def statistics(log, filename2cghubRecords, minmaxsize, verbose):
    states = {}
    centers = {}
    studies = {}
    sampleIDs = [{}, {}, {}, {}, {}, {}, {}, {}]
    diseases = {}
    analyte_codes = {}
    sample_types = {}
    strategies = {}
    platforms = {}
    refassems = {}
    models = {}
    for record in filename2cghubRecords.itervalues():
        states[record.state] = states.setdefault(record.state, 0) + 1
        centers[record.center_name] = centers.setdefault(record.center_name, 0) + 1
        studies[record.study] = studies.setdefault(record.study, 0) + 1
        diseases[record.disease_abbr] = diseases.setdefault(record.disease_abbr, 0) + 1
        analyte_codes[record.analyte_code] = analyte_codes.setdefault(record.analyte_code, 0) + 1
        sample_types[record.sample_type] = sample_types.setdefault(record.sample_type, 0) + 1
        strategies[record.library_strategy] = strategies.setdefault(record.library_strategy, 0) + 1
        platforms[record.platform] = platforms.setdefault(record.platform, 0) + 1
        refassems[record.refassem_short_name] = refassems.setdefault(record.refassem_short_name, 0) + 1
        models[record.platform_full_name] = models.setdefault(record.platform_full_name, 0) + 1
        
        try:
            fields = record.legacy_sample_id.split('-')
            for index, field in enumerate(fields[:-3]):
                sampleIDs[index][field] = sampleIDs[index].setdefault(field, 0) + 1
            for index, field in enumerate(fields[-3:]):
                sampleIDs[index + len(fields[:-3])][field] = sampleIDs[index + len(fields[:-3])].setdefault(field, 0) + 1
        except:
            util.log_exception(log, 'problem splitting %s(%s:%s)' % (record.legacy_sample_id, index, field))
            
    util.log_info(log, '\nStates')
    count = 0
    for state, value in states.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (state, value))
        else:
            util.log_info(log, '\t(of %s)' % (len(states.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Centers')
    count = 0
    for center, value in centers.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (center, value))
        else:
            util.log_info(log, '	(of %s)' % (len(centers.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Studies')
    count = 0
    for studie, value in studies.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (studie, value))
        else:
            util.log_info(log, '(of %s)' % (len(studies.keys())))
            break
    util.log_info(log, '')
    
    if verbose:
        util.log_info(log, 'Sample ids:')
        count = 0
        for sampleMap in sampleIDs:
            util.log_info(log, 'next part:')
            for sampleID, value in sampleMap.iteritems():
                if count < 15:
                    count += 1
                    util.log_info(log, '%s: %s' % (sampleID, value))
                else:
                    util.log_info(log, '(of %s)' % (len(sampleMap.keys())))
                    break
            util.log_info(log, '')
            count = 0
    
    util.log_info(log, 'Diseases:')
    count = 0
    for disease, value in diseases.iteritems():
        count += 1
        util.log_info(log, '%s: %s' % (disease, value))
    util.log_info(log, '')
    
    util.log_info(log, 'Analyte codes:')
    count = 0
    for analyte_code, value in analyte_codes.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (analyte_code, value))
        else:
            util.log_info(log, '(of %s)' % (len(analyte_codes.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Sample types')
    count = 0
    for sample_type, value in sample_types.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (sample_type, value))
        else:
            util.log_info(log, '(of %s)' % (len(sample_types.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Strategies:')
    count = 0
    for strategie, value in strategies.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (strategie, value))
        else:
            util.log_info(log, '(of %s)' % (len(strategies.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Platforms:')
    count = 0
    for platform, value in platforms.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (platform, value))
        else:
            util.log_info(log, '(of %s)' % (len(platforms.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Reference Assemblies:')
    count = 0
    for refassem, value in refassems.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (refassem, value))
        else:
            util.log_info(log, '(of %s)' % (len(refassems.keys())))
            break
    util.log_info(log, '')
    
    util.log_info(log, 'Models:')
    count = 0
    for model, value in models.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (model, value))
        else:
            util.log_info(log, '(of %s)' % (len(models.keys())))
            break
    
    util.log_info(log, '')
    util.log_info(log, '\n\t\tmax: %s\n\t\tmin: %s' % (minmaxsize['max'].write(), minmaxsize['min'].write()))
Example #22
def addPlatformPipelineFields(config, metadata, seen_bad_codes, log):
    fullPlatform2platform = config['uploadfullplatform2platform']
    analyte2strategy2moltype = config['analyte2strategy2moltype']
    shortname2centername = config['shortname2centername']
    try:
        platformName = fullPlatform2platform[metadata['platform_full_name']]
    except KeyError, e:
        print "metadata['platform_full_name']"
        pprint.pprint(metadata['platform_full_name'])
        log_exception(
            log,
            'KeyError in fullPlatform2platform[metadata["platform_full_name"]]: %s'
            % str(e))
    centerName = shortname2centername[metadata['DataCenterCode']]
    assembly = metadata['GenomeReference']
    try:
        moltype = analyte2strategy2moltype[metadata['analyte_code']][
            metadata['library_strategy']]
    except Exception as e:
        log_exception(
            log, 'problem setting molecular type: \'%s\' \'%s\'' %
            (metadata['analyte_code'], metadata['library_strategy']))
        raise e
    metadata['Platform'] = platformName + '_' + moltype
    metadata['Pipeline'] = centerName + '__' + moltype

    analyte2center_type = config['analyte2center_type']
    if metadata['analyte_code'] != '':
        metadata['DataCenterType'] = analyte2center_type[
            metadata['analyte_code']]

    shortname2centercodes = config['shortname2centercodes']
    shortname = metadata['DataCenterCode']
    if metadata['analyte_code'] in ('H', 'R', 'T'):
        metadata['DataCenterCode'] = shortname2centercodes[
            metadata['DataCenterCode']][1]
def addPlatformPipelineFields(config, metadata, seen_bad_codes, log):
    fullPlatform2platform = config['uploadfullplatform2platform']
    analyte2strategy2moltype = config['analyte2strategy2moltype']
    shortname2centername = config['shortname2centername']
    try:
        platformName = fullPlatform2platform[metadata['platform_full_name']]
    except KeyError, e:
        print "metadata['platform_full_name']"
        pprint.pprint(metadata['platform_full_name'])
        log_exception(log, 'KeyError in fullPlatform2platform[metadata["platform_full_name"]]: %s' % str(e))
    centerName = shortname2centername[metadata['DataCenterCode']]
    assembly = metadata['GenomeReference']
    try:
        moltype = analyte2strategy2moltype[metadata['analyte_code']][metadata['library_strategy']]
    except Exception as e:
        log_exception(log, 'problem setting molecular type: \'%s\' \'%s\'' % (metadata['analyte_code'], metadata['library_strategy']))
        raise e
    metadata['Platform'] = platformName + '_' + moltype
    metadata['Pipeline'] = centerName + '__' + moltype
    
    analyte2center_type = config['analyte2center_type']
    if metadata['analyte_code'] != '':
        metadata['DataCenterType'] = analyte2center_type[metadata['analyte_code']]

    shortname2centercodes = config['shortname2centercodes']
    shortname = metadata['DataCenterCode']
    if metadata['analyte_code'] in ('H', 'R', 'T'):
        metadata['DataCenterCode'] = shortname2centercodes[metadata['DataCenterCode']][1]
    elif metadata['analyte_code'] in ('D', 'W', 'X'):
        metadata['DataCenterCode'] = shortname2centercodes[metadata['DataCenterCode']][0]
    else:
def main(platform, type_uri = 'detail', log = None, removedups = False, limit = -1, verbose = False, print_response = False):
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    minmaxsize = {'min': CGHubFileInfo('', 500000000000, ''), 'max': CGHubFileInfo('', 1, '')}
    try:
#         archives = util.getURLData(manifest_uri, 'latest_manifest', log)
        response = urllib.urlopen(manifest_uri)
        archives = response.read()

        lines = archives.split('\n')
        util.log_info(log, '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        util.log_exception(log, 'problem fetching latest_manifest: %s' % str(e))
        raise e
    
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header
        
    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[column_index2header[index]]] = fields[index]
        except Exception as e:
            util.log_info(log, 'problem with parsing line(%s): %s' % (count, line))
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            if minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(log, 'no file size: %s--%s' % (record.write(), record.files['bam'].write()))

    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(log, 'finished reading cghub archive.  %s total records, %s duplicates' % (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize
Example #25
def main(platform,
         type_uri='detail',
         log=None,
         removedups=False,
         limit=-1,
         verbose=False,
         print_response=False):
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    minmaxsize = {
        'min': CGHubFileInfo('', 500000000000, ''),
        'max': CGHubFileInfo('', 1, '')
    }
    try:
        #         archives = util.getURLData(manifest_uri, 'latest_manifest', log)
        response = urllib.urlopen(manifest_uri)
        archives = response.read()

        lines = archives.split('\n')
        util.log_info(
            log,
            '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        util.log_exception(log, 'problem fetching latest_manifest: %s' % str(e))
        raise e

    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header

    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[
                    column_index2header[index]]] = fields[index]
        except Exception as e:
            util.log_info(log,
                          'problem with parsing line(%s): %s' % (count, line))
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(
                    filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            if minmaxsize['min'].filesize > record.files[
                    'bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(
                    log, 'no file size: %s--%s' %
                    (record.write(), record.files['bam'].write()))

    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(
        log,
        'finished reading cghub archive.  %s total records, %s duplicates' %
        (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize, archives
Example #26
def statistics(log, filename2cghubRecords, minmaxsize, verbose):
    states = {}
    centers = {}
    studies = {}
    sampleIDs = [{}, {}, {}, {}, {}, {}, {}, {}]
    diseases = {}
    analyte_codes = {}
    sample_types = {}
    strategies = {}
    platforms = {}
    refassems = {}
    models = {}
    for record in filename2cghubRecords.itervalues():
        states[record.state] = states.setdefault(record.state, 0) + 1
        centers[record.center_name] = centers.setdefault(
            record.center_name, 0) + 1
        studies[record.study] = studies.setdefault(record.study, 0) + 1
        diseases[record.disease_abbr] = diseases.setdefault(
            record.disease_abbr, 0) + 1
        analyte_codes[record.analyte_code] = analyte_codes.setdefault(
            record.analyte_code, 0) + 1
        sample_types[record.sample_type] = sample_types.setdefault(
            record.sample_type, 0) + 1
        strategies[record.library_strategy] = strategies.setdefault(
            record.library_strategy, 0) + 1
        platforms[record.platform] = platforms.setdefault(record.platform,
                                                          0) + 1
        refassems[record.refassem_short_name] = refassems.setdefault(
            record.refassem_short_name, 0) + 1
        models[record.platform_full_name] = models.setdefault(
            record.platform_full_name, 0) + 1

        try:
            fields = record.legacy_sample_id.split('-')
            for index, field in enumerate(fields[:-3]):
                sampleIDs[index][field] = sampleIDs[index].setdefault(
                    field, 0) + 1
            for index, field in enumerate(fields[-3:]):
                sampleIDs[index + len(fields[:-3])][field] = sampleIDs[
                    index + len(fields[:-3])].setdefault(field, 0) + 1
        except:
            util.log_exception(
                log, 'problem splitting %s(%s:%s)' %
                (record.legacy_sample_id, index, field))

    util.log_info(log, '\nStates')
    count = 0
    for state, value in states.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (state, value))
        else:
            util.log_info(log, '\t(of %s)' % (len(states.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Centers')
    count = 0
    for center, value in centers.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (center, value))
        else:
            util.log_info(log, '	(of %s)' % (len(centers.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Studies')
    count = 0
    for studie, value in studies.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (studie, value))
        else:
            util.log_info(log, '(of %s)' % (len(studies.keys())))
            break
    util.log_info(log, '')

    if verbose:
        util.log_info(log, 'Sample ids:')
        count = 0
        for sampleMap in sampleIDs:
            util.log_info(log, 'next part:')
            for sampleID, value in sampleMap.iteritems():
                if count < 15:
                    count += 1
                    util.log_info(log, '%s: %s' % (sampleID, value))
                else:
                    util.log_info(log, '(of %s)' % (len(sampleMap.keys())))
                    break
            util.log_info(log, '')
            count = 0

    util.log_info(log, 'Diseases:')
    count = 0
    for disease, value in diseases.iteritems():
        count += 1
        util.log_info(log, '%s: %s' % (disease, value))
    util.log_info(log, '')

    util.log_info(log, 'Analyte codes:')
    count = 0
    for analyte_code, value in analyte_codes.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (analyte_code, value))
        else:
            util.log_info(log, '(of %s)' % (len(analyte_codes.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Sample types')
    count = 0
    for sample_type, value in sample_types.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (sample_type, value))
        else:
            util.log_info(log, '(of %s)' % (len(sample_types.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Strategies:')
    count = 0
    for strategie, value in strategies.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (strategie, value))
        else:
            util.log_info(log, '(of %s)' % (len(strategies.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Platforms:')
    count = 0
    for platform, value in platforms.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (platform, value))
        else:
            util.log_info(log, '(of %s)' % (len(platforms.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Reference Assemblies:')
    count = 0
    for refassem, value in refassems.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (refassem, value))
        else:
            util.log_info(log, '(of %s)' % (len(refassems.keys())))
            break
    util.log_info(log, '')

    util.log_info(log, 'Models:')
    count = 0
    for model, value in models.iteritems():
        if count < 15:
            count += 1
            util.log_info(log, '%s: %s' % (model, value))
        else:
            util.log_info(log, '(of %s)' % (len(models.keys())))
            break

    util.log_info(log, '')
    util.log_info(
        log, '\n\t\tmax: %s\n\t\tmin: %s' %
        (minmaxsize['max'].write(), minmaxsize['min'].write()))
def byte_ngram(files_list, addrlength=32, n=1):
    dicts_list = []
    total_files = len(files_list)
    bad_files_names = []
    for idx, file_name in enumerate(files_list):
        bytes_file = DATASET_DIR + file_name + '.bytes.gz'
        try:
            with gzip.open(bytes_file, 'rt') as fp:
                bytedict = {}
                hex_seq = ""
                for line in fp.readlines():
                    if not line.strip():
                        continue
                    else:
                        address = int(addrlength / 4)  # hex to bytes
                        # ensure that addresses values will not be counted
                        # in the ngram calculation
                        hex_seq = hex_seq + line[address:].strip()

                hex_seq = hex_seq.replace(" ", "")
                for i in range(0, len(hex_seq) - 1, 2):
                    # ignore bytes that contain the "?" character
                    if hex_seq[i] == "?" or hex_seq[i + 1] == "?":
                        continue
                    if 2 * n + i > len(hex_seq):
                        break

                    gram = hex_seq[i:(2 * n + i)]
                    if gram not in bytedict.keys():
                        bytedict[gram] = 1
                    else:
                        bytedict[gram] += 1

                dicts_list.append(bytedict)
        except Exception as e:
            bad_files_names.append(file_name)
            log_exception(e, sys.argv[0], bytes_file)

        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)

    # log the corrupted files for future reference
    if len(bad_files_names) > 0:
        with open('bad_bytes_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.bytes\n')

    # convert list of dictionaries to a byte ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    # store frequency of each byte ngram
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv')
    save_obj(ngram_freq_df, str(n) + 'gram_byte_freq')

    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # store tfidf of each byte ngram
    ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(),
                                  columns=vec.get_feature_names())
    ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv')
    save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf')
    return ngram_tfidf_df
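A short usage sketch for byte_ngram; the file IDs are hypothetical and DATASET_DIR plus the helper functions above are assumed to be available:

files = ['sample01', 'sample02']  # IDs without the '.bytes.gz' suffix (hypothetical)
tfidf_2gram = byte_ngram(files, addrlength=32, n=2)
print(tfidf_2gram.shape)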
def _set_master(self):
    if not self._ctx.node_list:
        util.log_exception(sys.exc_info(), "Unable to set a master for the node")
        self.shutdown()
    self._assign_master(self._ctx.node_list[0])
Example #29
    def check_node_still_dead(self, node):
        print(" **************************ENTER check_node_still_dead")
        # ogger = logging.getLogger('nodechecker.ncv')
        self.logger.debug('*****************************ENTER check_node_still_dead')
        # global active_node_list
        # global new_dead_node_set
        # global dead_node_set
        # global min_time_diff

        # now = time.mktime(time.localtime())
        self.logger.debug("hm root" + self._ctx.conf.hm_root)
        #path = os.path.join(self._ctx.conf.hm_root, self._
        # ctx.conf.collectd_home, self._ctx.conf.collectd_rrd_dir, n.hostname)

        path = os.path.join(self._ctx.conf.hm_root, self._ctx.conf.collectd_home, self._ctx.conf.collectd_rrd_dir,
                            node.hostname)

        path1 = os.path.join(self._ctx.conf.hm_root, self._ctx.conf.collectd_home, self._ctx.conf.collectd_rrd_dir)
        path2 = os.path.join(self._ctx.conf.hm_root, self._ctx.conf.collectd_home)
        #        path3= os.path.join(self._ctx.conf.hm_root, self._ctx.conf.collectd_home, self.
        # _ctx.conf.collectd_rrd_dir, node.hostname)
        self.logger.debug("path1" + path1)
        self.logger.debug("path2" + path2)

        self._ctx.resource_lock.acquire()
        try:
            self._ctx.min_time_diff = -1
            self.logger.debug("path" + path)
            os.path.walk(path, self.find_minimal_rrd_timestamp, [time.mktime(time.localtime())])
            diff = self._ctx.min_time_diff
            self.logger.debug("dif:" + str(diff))
            self.logger.debug("timeout" + str(self._ctx.dead_node_timeout))
            if 0 < diff < self._ctx.dead_node_timeout:
                self.logger.debug('check_node_still_dead() node is alive: Diff < self._ctx.dead_node_timeout')
                pass
            else:
                self.logger.debug('check_node_still_dead() node is dead' + node.ip_address)
                if node in self._ctx.active_node_list:
                    self._ctx.active_node_list.remove(node)
                else:
                    print("this is strange")
                self._ctx.dead_node_set.add(node.ip_address)
                util.send(self._ctx.this_node,
                          self._ctx.node_list,
                          util.json_from_list(
                              self._ctx.active_node_list, 'active_node_list'))

                self.logger.debug("process node status alerts...")
                self._ctx.ntf_manager.process_node_status_alerts([node], "DEAD_NODE")

                self.logger.debug("storing list to file...")
                util.store_list_to_file(
                    self._ctx.active_node_list, self._ctx.active_node_list_file, self._ctx.this_node.group_name)

                #self.logger.debug("removing node from set of new dead...")
                #self._ctx.new_dead_node_set.remove(node.ip_address)
        except:
            self.logger.debug("Star 3")
            util.log_exception(sys.exc_info())
            print(" EXCEPTION in check_node_still_dead" + str(sys.exc_info()))
            pass
        finally:
            self._ctx.new_dead_node_set.remove(node.ip_address)
            #self.logger.debug("self._ctx.new_dead_node_set.remove" + str(len(self._ctx.new_dead_node_set)))
            self._ctx.resource_lock.release()
        print(" exit check_node_still_dead")