Example #1
  def binarize_tree(self, next_id):
    oid = next_id
    tree = self.parse
    amr = self.amr

    # handle all-terminal rules
    if not any(s[0] == '#' for s in tree.leaves()):
      return [VoRule(next_id, self.symbol, self.weight, self.amr, self.parse,
        self.rhs1_visit_order, self.rhs2_visit_order)], next_id + 1

    # handle rules containing nonterminals
    rules = []
    try:
      tree, amr, at_rules, next_id = self.collapse_amr_terminals(tree, amr,
          next_id)
      rules += at_rules

      tree, amr, ts_rules, next_id = self.merge_tree_symbols(tree, amr, next_id)
      rules += ts_rules
    except BinarizationException:
      log.warn('Unbinarizable rule!')
      return None, oid

    # sanity check as above
    assert isinstance(tree, str)
    assert len(amr.triples()) == 1
    rules.append(VoRule(next_id + 1, self.symbol, self.weight, amr, tree))
    return rules, next_id + 2
Example #2
  def binarize(self, next_id):
    oid = next_id
    tree = self.parse
    amr = self.amr

    # handle all-terminal rules
    if not any(s[0] == '#' for s in tree.leaves()):
      return [VoRule(next_id, self.symbol, self.weight, self.amr, self.parse,
        self.rhs1_visit_order, self.rhs2_visit_order)], next_id + 1

    # handle rules containing nonterminals
    rules = []
    try:
      tree, amr, at_rules, next_id = self.collapse_amr_terminals(tree, amr,
          next_id)
      rules += at_rules

      string = tree.leaves()

      string, amr, st_rules, next_id = self.collapse_string_terminals(string,
          amr, next_id)
      rules += st_rules

      string, amr, nt_rules, next_id = self.merge_string_nonterminals(string,
          amr, next_id)
      rules += nt_rules
    except BinarizationException:
      log.warn('Unbinarizable rule!')
      return None, oid
    
    # sanity check---did we completely binarize the rule?
    assert len(string) == 1
    assert len(amr.triples()) == 1
    rules.append(VoRule(next_id + 1, self.symbol, self.weight, amr, string[0]))
    return rules, next_id + 2
Example #3
 def backup_redo_log(self, save_path):
     dir_log_source = 'SYSTEMDB' if self.target_db.upper(
     ) == 'SYSTEMDB' else 'DB_{}'.format(self.target_db.upper())
     default_log_path = '{}/backup/log/{}'.format(self.dir_instance,
                                                  dir_log_source)
     cp_redo_log = "su - {} -c 'cp {}/* {}'".format(self.hana_adm,
                                                    default_log_path,
                                                    save_path)
     try_times = 0
     try:
         while try_times < 3:
             result = exec_cmd2(cp_redo_log)
             status = result['ret']
             output = result['msg'].strip()
             try_times += 1
             if status == 0:
                 return True
             else:
                 log.warn('[TASK_ID:' + str(self.task_id) +
                          '] backup_redo_log cmd fail! dump_cmd:' +
                          cp_redo_log + ' status:' + str(status) +
                          ' output:' + output)
                 exec_cmd2('rm -rf {}/log_*'.format(self.backup_dir))
         return False
     except Exception as ex:
         error()
         log.error('[backup_redo_log]' + str(type(ex)) + ":" + str(ex))
         return False
Example #4
 def break_clusters(self, clusters, *args, **kwargs):
     log.debug('Breaking clusters with:\n{}'.format(str(self)))
     result = []
     for i, cluster in enumerate(clusters):
         if self.to_break(cluster):
             try:
                 sub_clusters = self.break_cluster(cluster, *args, **kwargs)
                 if not sub_clusters:
                     log.warn('Cluster {} not broken'.format(cluster.id))
                     result.append(cluster)
                 else:
                     log.info(
                         'Breaking cluster {} into {} sub_clusters'.format(
                             cluster.id, len(sub_clusters)))
                     result.extend(sub_clusters)
             except (lpinterface.NoSolutionsError, UnboundLocalError,
                     TypeError, ValueError) as e:
                 log.error(
                     'Cluster breaking failed for cluster {} - see log'.
                     format(cluster.id))
                 log.debug(sys.exc_info())
                 result.append(cluster)
         else:
             result.append(cluster)
     return result
Example #5
def iterate_fasta(
    filename='../database/V-QUEST-reference-allele-db+no-period-references.clustalw.fasta'
):
    def strip_fasta_ID(
            s):  # strips out allele name from the IMGT fasta naming convention
        s = s.split('|')
        return (s[1], s[3], s[0])  # (name, functional_value, accession)

    if not os.path.exists(filename):
        log.info('Fetching IMGT V gene reference nucleotide sequences')
        if not fetch_reference(
                'IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP'
        ):
            log.error(
                'Unable to find/fetch from IMGT V gene nucleotide references')
            os.sys.exit()

    consensus = ''
    current_gene = None

    return_db = OrderedDict()
    consensus = get_consensus().lower().replace('-', '.')

    for record in SeqIO.parse(filename, 'fasta'):
        allele, functional, accession = strip_fasta_ID(record.description)

        # if allele != 'IGHV1-18*01':
        # 	continue

        # log.debug('\nAllele {}'.format(allele))

        seq = str(record.seq).lower().replace('-', '.')
        # log.debug('Allele: '+ allele)

        if not current_gene or allele.split(
                '*')[0] != current_gene:  # first entry in iteration
            current_gene = allele.split('*')[0]
            # return_db[current_gene] = OrderedDict({'alleles': OrderedDict()})
        # log.debug('allele seq:\n{}\nConsensus seq:\n{}'.format(seq, consensus))
        length = len(seq.replace('.', ''))
        variants, seq, msg = call_variants.get_variants(seq, consensus)
        # log.debug('Has {} variants'.format(len(variants)))
        if msg:
            log.warn('\nAllele {}'.format(allele))
            common.log_msg(msg)

        return_db[allele] = OrderedDict({
            'imgt_accession': accession,
            'functional': functional,
            'seq': seq,
            'length': length
        })
        if variants:
            # log.debug(variants)
            if __name__ != '__main__':
                variants = sets.Set([(x['pos'], x['op']) for x in variants])
            return_db[allele]['variants'] = variants
    # remove_IMGT_periods(return_db[current_gene]['alleles'][allele])

    return return_db
Example #6
def parse_karolinska_html(page, func):
   soup = BeautifulSoup(page, 'lxml')
   data = autodict()
   dbsnp = collections.defaultdict(lambda: '*')
   
   # nested-table selection is effectively disabled ('or True' always fell
   # through to the plain selector), so query all tables directly
   tables = soup.select('table')
   for table in tables:
      for row in table.find_all('tr'): # skip header
         items = func(row, dbsnp)
         if len(items) < 3 or len(filter(None, items[2])) == 0 or items[0] == 'Allele' or items[0] == '': 
            continue
         items[0] = items[0].split()[0]
         if 'X' in items[0] or 'x' in items[0]:
            log.warn('Karolinska: Ignoring {}', items[0])
            continue 
         if items[0] in data:
            log.warn('Karolinska: {} already exists, overwriting it', items[0])
            log.debug('Karolinska: overwriting {} with {}', items[0], ','.join(map(str, items[2])))
         data[items[0]].update({
            'mutations': items[2],
            'phenotype': {
               'invivo': items[-3],
               'invitro': items[-2],
            }
         })
   return data
Example #7
    def check_host_groups(self):
        """
        This method checks if some host group exists

        """
        for item in self.group_list:
            tenant_name = item[0]
            payload = {
                "jsonrpc": "2.0",
                "method": "hostgroup.exists",
                "params": {
                    "name": tenant_name
                },
                "auth": self.api_auth,
                "id": 1
            }
            response = self.contact_zabbix_server(payload)
            if response['result'] is False:
                log.warn("Host Group %s does not existed, Creating ... " %tenant_name)
                payload = {"jsonrpc": "2.0",
                           "method": "hostgroup.create",
                           "params": {"name": tenant_name},
                           "auth": self.api_auth,
                           "id": 2}
                self.contact_zabbix_server(payload)
            else:
                log.info("Host Group %s has already existed ..." %tenant_name)
Example #8
def process_mut_helper(m, dbsnp):
    result = []
    rs = re.findall(r'\s*?([-0-9_]+)\s*?((ins|del|dup|[ACGT])(\S*))',
                    ' ' + m)  # Match 100C>T, -100C>T, -100 C>T etc.
    if len(rs) == 0:
        return None
    for r in rs:
        pos, op = int(r[0].split('_')[0]), r[1].split()[0]
        op = op.replace('*', '')
        if len(op) == 1:  # e.g. 51A
            op = 'A>{}'.format(op)
        elif '>' in op and len(op) != 3:  # e.g. -1601_-1600GA>TT;
            op = op.split('>')
            assert (len(op) == 2)
            if len(op[0]) == len(op[1]):
                result += [[
                    pos + i, '{}>{}'.format(op[0][i], op[1][i]), dbsnp
                ] for i in xrange(len(op[1])) if op[0][i] != op[1][i]]
                continue
            else:  # e.g. 3030G>G/A
                if '/' in op[1] and len(op[0]) == 1:
                    result += [[pos, '{}>{}'.format(op[0], c), dbsnp]
                               for c in op[1].split('/') if op[0] != c]
                else:
                    log.warn('Main: Ignoring {}', m)
                continue
        if op[:3] == 'dup':
            op = 'ins' + op[3:]
        if op[-2:] == 'x2':  # detect ins<something>x2
            op = op[:-2] + op[3:-2]
        result.append([pos, op, dbsnp])
    return result
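The regex at the top of this helper does the heavy lifting: it pulls (position, operation) pairs such as 100C>T, -100 C>T or 52delA out of free-text mutation fields. A minimal standalone sketch of just that matching step (the extract_ops name is invented here; the dbsnp argument and the dup/x2 normalisation are left out):

import re

_MUT = re.compile(r'\s*?([-0-9_]+)\s*?((ins|del|dup|[ACGT])(\S*))')

def extract_ops(m):
    # toy version of the matching step: (position, raw operation) pairs
    return [(int(r[0].split('_')[0]), r[1].split()[0])
            for r in _MUT.findall(' ' + m)]

print(extract_ops('100C>T'))    # [(100, 'C>T')]
print(extract_ops('-100 C>T'))  # [(-100, 'C>T')]
print(extract_ops('52delA'))    # [(52, 'delA')]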
Example #9
 def test_scip(name):
     try:
         model = SCIP(name)
         log.warn('Using SCIP')
     except ImportError:
         log.warn(
             'SCIP not found. Please install SCIP and pyscipopt Python package.'
         )
         model = None
     return model
Example #10
 def test_gurobi(name):
     try:
         model = Gurobi(name)
         log.warn('Using Gurobi')
     except ImportError:
         log.warn(
             'Gurobi not found. Please install Gurobi and gurobipy Python package.'
         )
         model = None
     return model
Example #11
    def em_step(self,
                corpus,
                parser_class,
                normalization_groups,
                bitext=False):
        """ 
        Perform a single step of EM on the 
        """
        ll = 0.0

        counts = defaultdict(float)

        parser = parser_class(self)
        if bitext:
            if parser_class == ParserTD:
                log.err(
                    "Bigraph parsing with tree decomposition based parser is not yet implemented. Use '-p basic'."
                )
                sys.exit(1)
            parse_generator = parser.parse_bitexts(corpus)
        else:
            if self.rhs1_type == "string":
                if parser_class == ParserTD:
                    log.err(
                        "Parser class needs to be 'basic' to parse strings.")
                    sys.exit(1)
                else:
                    parse_generator = parser.parse_strings(corpus)
            else:
                parse_generator = parser.parse_graphs(corpus)

        i = 0
        for chart in parse_generator:
            i += 1
            if not chart:
                log.warn("No parse for sentence %d." % i)
                continue
            inside_probs = chart.inside_scores()
            outside_probs = chart.outside_scores(inside_probs)
            ll += inside_probs["START"]
            counts_for_graph = chart.expected_rule_counts(
                inside_probs, outside_probs)
            for r in counts_for_graph:
                counts[r] = counts[r] + counts_for_graph[r]

        # rules with no observed counts fall back to LOGZERO
        for r in self:
            if r in counts:
                self[r].weight = counts[r]
            else:
                self[r].weight = LOGZERO

        self.normalize_by_groups(normalization_groups)

        return ll
Example #12
    def warn_candidates(self, cluster):
        from common import get_columns
        ground_truth = [x.id.split('_')[0] for x in cluster]

        count = get_columns(Counter(ground_truth).most_common())

        missing = set(ground_truth).difference(
            set([x.id for x in cluster.candidates]))
        if missing:
            log.warn(
                '---\nCluster {}\nMissing {} from candidates\nGround Truth:\n{}'
                .format(cluster.id, str(missing), count))
Example #13
    def __init__(self, servers, **runner_args):
        self.servers = ' '.join(servers).split(' ')
        self.runner_args = runner_args

        for command in commands:

            if hasattr(self, command.module_name):
                log.warn('{} conflicts with existing attribute'.format(
                    command.name))
                continue

            # bind the current command as a default argument; a bare closure
            # would capture only the last command in the loop
            run = lambda command=command, **arguments: command.execute(
                self.servers, arguments)

            setattr(self, command.module_name, run)
Example #14
File: api.py Project: href/suitable
    def __init__(self, servers, **runner_args):
        self.servers = ' '.join(servers).split(' ')
        self.runner_args = runner_args

        for command in commands:

            if hasattr(self, command.module_name):
                log.warn('{} conflicts with existing attribute'.format(
                    command.name
                ))
                continue

            # bind the current command as a default argument; a bare closure
            # would capture only the last command in the loop
            run = lambda command=command, **arguments: command.execute(
                self.servers, arguments)

            setattr(self, command.module_name, run)
Example #15
 def get_item_data(self,sample):
     #print "Pool Gthead Num:",self.pool.running()
     item_data = {'host':'','key':'error','value':''}
     try:
         resource_id = sample['resource_id']
         counter_name = sample['counter_name']
         counter_volume = sample['counter_volume']
         #got resource_id
         if resource_id.split('-')[0] == 'instance':
             resource_id = sample['resource_metadata']['instance_id']
         #got item key
         counter_name = transfer_item_key(counter_name)
         if counter_name in self.moniter_items:
             item_data = {'host':resource_id,'key':counter_name,'value':counter_volume}
     except Exception as e:
         log.warn(str(e))
     return item_data
Example #16
    def ceilometer_callback(self, ch, method, properties, body):
        """
        Method used by method ceilometer_amq() to filter messages by type of message.

        :param ch: refers to the head of the protocol
        :param method: refers to the method used in callback
        :param properties: refers to the proprieties of the message
        :param body: refers to the message transmitted
        """
        payload = json.loads(body)
        try:
            message_body = json.loads(payload["oslo.message"])
            samples = message_body["args"]["data"]
            # print "--------------------------------------------------"
            self.pool.spawn_n(self.zabbix_sender.consume_samples, samples)
        except Exception as e:
            log.warn(str(e))
Example #17
    def em_step(self, corpus, parser_class, normalization_groups, bitext = False):
        """ 
        Perform a single step of EM on the 
        """
        ll = 0.0

        counts = defaultdict(float)

        parser = parser_class(self)
        if bitext: 
            if parser_class == ParserTD:
                log.err("Bigraph parsing with tree decomposition based parser is not yet implemented. Use '-p basic'.")
                sys.exit(1)
            parse_generator = parser.parse_bitexts(corpus)
        else: 
            if self.rhs1_type == "string":
                if parser_class == ParserTD:
                    log.err("Parser class needs to be 'basic' to parse strings.")
                    sys.exit(1)
                else: 
                    parse_generator = parser.parse_strings(corpus)
            else: 
                parse_generator = parser.parse_graphs(corpus)
        
        i = 0
        for chart in parse_generator:
            i += 1   
            if not chart: 
                log.warn("No parse for sentence %d." % i)
                continue 
            inside_probs = chart.inside_scores()
            outside_probs = chart.outside_scores(inside_probs)
            ll += inside_probs["START"]
            counts_for_graph = chart.expected_rule_counts(inside_probs, outside_probs)
            for r in counts_for_graph:
                counts[r] = counts[r] + counts_for_graph[r]
      
        # rules with no observed counts fall back to LOGZERO
        for r in self:
            if r in counts:
                self[r].weight = counts[r]
            else:
                self[r].weight = LOGZERO
       
        self.normalize_by_groups(normalization_groups) 

        return ll 
Example #18
def set_lsf(cfg):
    """
    Set LSF specific :py:data:`~lrms.common.Config` attributes.

    :param cfg: parsed arc.conf
    :type cfg: :py:class:`ConfigParser.ConfigParser`
    """

    Config.lsf_bin_path = str(cfg.get('common', 'lsf_bin_path')).strip('"') if cfg.has_option('common', 'lsf_bin_path') else '/usr/bin'

    if cfg.has_option('common', 'lsf_profile_path'):
        Config.lsf_setup = 'source %s &&' % str(cfg.get('common', 'lsf_profile_path')).strip('"')
    else:
        warn('lsf_profile_path not set in arc.conf', 'lsf')
        Config.lsf_setup = ''

    Config.localtransfer = False
    Config.lsf_architecture = str(cfg.get('common', 'lsf_architecture')).strip('"') if cfg.has_option('common', 'lsf_architecture') else ''
Example #19
    def fix_candidate_rev_comp(self, cluster):
        mappings = self.minimap.run(
            [SeqRecord('cons', cluster.consensus.replace('.', ''))],
            cluster.candidates,
            params=self.minimap.params +
            ' -N{}'.format(len(cluster.candidates)))

        mappings = dict([(m.tName, m) for m in mappings])

        for c in cluster.candidates:
            try:
                if mappings[c.id].strand == '-':
                    c.seq = str(Seq(c.seq).reverse_complement())
                    log.debug('Reverse complemented candidate {}'.format(c.id))
            except KeyError:
                log.warn(
                    'Breaking cluster {}: candidate {} not in rev-compl mapping. Read support: {}'
                    .format(cluster.id, c.id, c.read_mapping_support))
Example #20
 def hana_db_backup(self):
     try:
         config = getconf()
         try_times = 0
         save_path = os.path.join(self.backup_dir, self.target_db)
         if self.backup_mode == config.DB_BACKUP_TYPE_FULL:
             self.full_backup_clear_old_log()
             backup_command = r"\"backup data for {} using file ('{}/full')\"".format(
                 self.target_db, save_path)
         else:
             backup_command = r"\"backup data DIFFERENTIAL for {} using file ('{}/diff')\"".format(
                 self.target_db, save_path)
         exec_command = self.system_db_exec_command_str(backup_command)
         exec_command_log = exec_command.replace(
             r'-p \"{}\"'.format(self.system_db_pwd), '-p ******')
         log.info('backup cmd is {}:'.format(exec_command_log))
         while try_times < 3:
             log.debug('[TASK_ID:' + str(self.task_id) +
                       '] hana_db_backup cmd execute. cmd:' +
                       exec_command_log + '')
             result = exec_cmd2(exec_command)
             status = result['ret']
             output = result['msg'].strip()
             log.debug('[TASK_ID:' + str(self.task_id) +
                       '] hana_db_backup cmd finish! status:' +
                       str(status) + ' output:' + output)
             try_times += 1
             if status != 0:
                 log.warn('[TASK_ID:' + str(self.task_id) +
                          '] hana_db_backup cmd fail! dump_cmd:' +
                          exec_command_log + ' status:' + str(status) +
                          ' output:' + output)
                 continue
             if self.backup_mode == config.DB_BACKUP_TYPE_FULL:
                 self.gen_hana_fullback_info_file()
                 return True
             if self.backup_mode == config.DB_BACKUP_TYPE_DIFF:
                 if self.backup_redo_log(save_path):
                     return True
         return False
     except Exception as ex:
         error()
         log.error('[HANA_DB_BACKUP]' + str(type(ex)) + ":" + str(ex))
         return False
Example #21
    def create_host(self, instance_name, instance_id, tenant_name):

        """
        Method used to create a host in Zabbix server

        :param instance_name: refers to the instance name
        :param instance_id:   refers to the instance id
        :param tenant_name:   refers to the tenant name
        """
        group_id = self.find_group_id(tenant_name)

        if instance_id not in instance_name:
            instance_name = self.zabbix_proxy_name + '_1_' + instance_id
        log.warn("VM Instance %s does not exist, creating ..." % instance_name)
        payload = {"jsonrpc": "2.0",
                   "method": "host.create",
                   "params": {
                       "host": instance_id,
                       "name": instance_name,
                       "proxy_hostid": self.proxy_id,
                       "interfaces": [
                           {
                               "type": 1,
                               "main": 1,
                               "useip": 1,
                               "ip": "127.0.0.1",
                               "dns": "",
                               "port": "10050"}
                       ],
                       "groups": [
                           {
                               "groupid": group_id
                           }
                       ],
                       "templates": [
                           {
                               "templateid": self.template_id
                           }
                       ],

                   },
                   "auth": self.api_auth,
                   "id": 1}
        self.contact_zabbix_server(payload)
Example #22
def get_rates(product, start_dt, end_dt, sec_per_tick):
    """
    Returns the rates with the schema
        [unix time, low, high, open, close, volume]
    for the specified product and time interval.
    """
    cur_end = end_dt

    all_rates = []
    while cur_end > start_dt:
        cur_start = cur_end - timedelta(seconds=sec_per_tick * max_ticks)

        # We want to stop once we reach the initial start datetime.
        if cur_start < start_dt:
            cur_start = start_dt

        rates, resp = marketdata.get_rates(product,
                                           start_dt=cur_start,
                                           end_dt=cur_end,
                                           sec_per_tick=sec_per_tick)

        if resp.status_code != 200:
            # Rate limit, wait another fetch delay
            if resp.status_code == 429:
                log.warn('429 Too Many Requests, waiting for {}s to retry...'.
                         format(_fetch_delay))
                time.sleep(_fetch_delay)
                continue

            log.error('non-200 status code when SCRAPING HISTORICAL RATES')
            log.error('status code: ' + str(resp.status_code))
            log.error('reason: ' + resp.reason)
            log.error('message: ' + resp.text)
            break

        all_rates.extend(rates)

        # Update cur_end for the next retrieval
        fetched_start = datetime.utcfromtimestamp(rates[-1][0])
        cur_end = fetched_start - timedelta(seconds=sec_per_tick)
        time.sleep(_fetch_delay)

    return all_rates
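The loop above pages backwards from end_dt in windows of sec_per_tick * max_ticks seconds and retries on HTTP 429. A small self-contained sketch of just the windowing arithmetic (max_ticks is assumed to be the module-level constant of 200 mentioned in the wrapped API's docstring):

from datetime import datetime, timedelta

max_ticks = 200        # assumed module-level constant
sec_per_tick = 60
start_dt = datetime(2021, 1, 1)
end_dt = datetime(2021, 1, 2)

cur_end = end_dt
while cur_end > start_dt:
    cur_start = max(cur_end - timedelta(seconds=sec_per_tick * max_ticks), start_dt)
    print(cur_start.isoformat(), '->', cur_end.isoformat())
    cur_end = cur_start - timedelta(seconds=sec_per_tick)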
Example #23
    def get_template_id(self):
        """
        Method used to check if the template already exists. If not, creates one

        :return: returns the template ID
        """
        global template_id
        payload = { 
            "jsonrpc": "2.0",
            "method": "template.exists",
            "params": {
                "host": self.template_name
            },
            "auth": self.api_auth,
            "id": 1
        }
        log.info("Getting Platform VM's Template id ....")
        response = self.contact_zabbix_server(payload)
        if response['result'] is True:
            payload = {"jsonrpc": "2.0",
                       "method": "template.get",
                       "params": {
                           "output": "extend",
                           "filter": {
                               "host": [
                                   self.template_name
                               ]
                           }
                       },
                       "auth": self.api_auth,
                       "id": 1
            }
            response = self.contact_zabbix_server(payload)
            for item in response['result']:
                template_id = item['templateid']
                log.info("Template exists ....\n Template id:%s" %template_id)
        else:
            log.warn("Template does not exist!!! Creating...")
            group_id = self.get_group_template_id()
            template_id = self.create_template(group_id)
        return template_id
Example #24
    def parse_poa_pir(self, data, cluster=None):

        output = list(SeqIO.parse(data, 'fasta'))
        consensus = [x for x in output if 'CONSENS' in x.id]
        msa = dict([(x.id, str(x.seq)) for x in output
                    if 'CONSENS' not in x.id])

        try:
            cluster.msa = []
            # check PIR contains same reads as cluster
            if len(
                    set([c.id for c in cluster]).symmetric_difference(
                        msa.keys())) > 0:
                raise IOError
            # assign MSA sequences to reads
            for read in cluster:
                read.alignment = str(msa[read.id])
                cluster.msa.append(read.alignment)
        except AttributeError as e:
            pass

        if len(consensus) > 1:
            try:
                log.warn(
                    '{} generated alignment for cluster {} returned {} consensus sequences - using only first sequence:'
                    .format(self.name, cluster.id, len(consensus)))
            except AttributeError as e:
                log.warn(
                    '{} generated alignment returned {} consensus sequences - using only first sequence'
                    .format(self.name, len(consensus)))
            log.debug('\n'.join([x.id for x in consensus]))

        if consensus:
            consensus = str(consensus[0].seq)

        try:
            cluster.consensus = str(consensus)
        except AttributeError:
            pass

        return consensus, msa
Example #25
 def get_proxy_id(self):
     """
     Method used to check if the proxy exists.
     :return: a control value and the proxy ID if exists
     """
     payload = {
         "jsonrpc": "2.0",
         "method": "proxy.get",
         "params": {
             "output": "extend"
         },
         "auth": self.api_auth,
         "id": 1
     }
     response = self.contact_zabbix_server(payload)
     proxy_id = None
     log.info("Getting Platform Proxy id ...")
     for item in response['result']:
         if item['host'] == self.zabbix_proxy_name:
             proxy_id = item['proxyid']
             break
     if not proxy_id:
         '''
         Check if proxy exists, if not create one
         '''
         log.warn("Proxy id does not exists, Creating ....")
         payload = {"jsonrpc": "2.0",
                    "method": "proxy.create",
                    "params": {
                        "host": self.zabbix_proxy_name,
                        "status": "5"
                    },
                    "auth": self.api_auth,
                    "id": 1
         }
         response = self.contact_zabbix_server(payload)
         proxy_id = response['result']['proxyids'][0]
     log.info("Proxy id: %s" %proxy_id)
     return proxy_id
Example #26
    def load_connections(
        config: dict,
        session: Session = None,
    ):
        connections = config.get("connections", None)
        if connections is None:
            log.info("No connections found, skipping")
            return

        log.info("Loading variabels from config...")
        for key in connections.keys():
            val: dict = connections.get(key)
            if not isinstance(val, dict):
                log.warn(
                    f"Connection {key} skipped. Value must be a dictionary.")
                continue

            connection = session.query(Connection).filter_by(
                conn_id=key).first()
            if connection is not None:
                log.info(f"Connection exists, skipping: {key}")
                continue

            log.info("Setting connection: " + key)
            extra = val.get("extra", None)
            if extra is not None and not isinstance(extra, (int, str)):
                extra = json.dumps(extra)

            connection = Connection(
                conn_id=key,
                conn_type=val.get("conn_type", None),
                host=val.get("host", None),
                login=val.get("login", None),
                password=val.get("password", None),
                schema=val.get("schema", None),
                port=val.get("port", None),
                extra=extra,
            )
            session.add(connection)
        session.commit()
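The loader above expects a "connections" mapping inside the config dictionary; each value supplies the fields read with val.get(...). A hedged sketch of that input shape (the connection id and values are invented for illustration):

config = {
    "connections": {
        "my_postgres": {
            "conn_type": "postgres",
            "host": "db.example.com",
            "login": "airflow",
            "password": "secret",
            "schema": "analytics",
            "port": 5432,
            # non-scalar "extra" values are JSON-encoded by the loader
            "extra": {"sslmode": "require"},
        },
    },
}
# load_connections(config, session=session)  # session: a SQLAlchemy Session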
Example #27
def get_rates(product, start_dt=None, end_dt=None, sec_per_tick=5):
    """
    Returns the list of ticks with the schema
        [unix time, low, high, open, close, volume]
    for the specified DATETIME interval. A maximum of 200 ticks will be returned
    and missing ticks are possible.

    By default it returns the last 1000 seconds of historic data with sec_per_tick = 5.

    Also returns the full response object.
    """

    if end_dt is None:
        end_dt = datetime.utcnow()

    if start_dt is None:
        start_dt = end_dt - timedelta(seconds=max_ticks * sec_per_tick)

    params = {
            'start': start_dt.isoformat(),
            'end': end_dt.isoformat(),
            'granularity': sec_per_tick,
            }

    log.info('getting HISTORIC RATES')
    resp = httpapi.get(
            common.api_url + 'products/' + product + '/candles',
            params=params,
            auth=common.auth,
            )

    rates = resp.json()

    if resp.status_code == 200 and len(rates) > 0:
        fetched_start = datetime.utcfromtimestamp(rates[-1][0])
        fetched_end = datetime.utcfromtimestamp(rates[0][0])

        # TODO(richardwu): For some reason GDAX over-extends and returns
        # values < start_dt.
        # assert(fetched_start >= start_dt)
        if fetched_start < start_dt:
            log.warn('KNOWN BUG: HISTORIC RATES fetched_start < cur_start')
            log.warn('fetched_start: ' + fetched_start.isoformat())
            log.warn('start_dt: ' + start_dt.isoformat())
        assert(fetched_end <= end_dt)

    return rates, resp
Example #28
    def call_cluster(self,
                     cluster,
                     filter_function=None,
                     result_filter=None,
                     temp_file_path=None):
        import tempfile

        if len(cluster) == 1:
            log.warn('Cluster {} has single read, not calling'.format(
                cluster.id))
            try:
                cluster.consensus_seq = None
                cluster.consensus_builder = None
                cluster.set_call(None)
                cluster.candidates = None
                cluster.candidates_method = str(self)
            except AttributeError as e:
                pass
            finally:
                return None

        consensus_seq = None
        consensus_seq_id = None
        f = None
        is_cluster_inst = False  # flag for filling descriptive attributes
        if hasattr(cluster, '__getitem__'
                   ):  # assumed to be list of sequences, get consensus
            try:
                if temp_file_path:
                    with open(temp_file_path, 'wb') as f:
                        f.write(
                            fasta_from_seq(*zip(*[(x.id, x.seq)
                                                  for x in cluster])))
                consensus_seq = self.consensus_builder.generate_consensus(
                    temp_file_path if temp_file_path else cluster)
                if not consensus_seq:
                    cluster.consensus = None
                    cluster.candidates_method = str(self)
                    return
                consensus_seq_id = 'cons'
                log.info('Generated consensus with:\n{}'.format(
                    str(self.consensus_builder)))
                log.debug('Output:\n{}'.format(consensus_seq))

                try:
                    cluster.consensus = consensus_seq
                    cluster.consensus_method = str(self.consensus_builder)
                except AttributeError as e:
                    pass
            except TypeError as e:  ## No consensus builder is set
                raise ValueError(
                    'Cluster calling: list of cluster sequences provided but no consensus builder instantiated.'
                )
        else:
            if isinstance(cluster, basestring):  # input is path
                if os.path.exists(cluster):
                    cons_path = cluster
                else:
                    raise ValueError(
                        'Cluster calling input invalid. String provided but is not a valid path; pass a Bio.Seq.Seq-like object instead.'
                    )
            else:  # input is consensus seq
                consensus_seq = cluster.seq
                consensus_seq_id = cluster.id

        ## save blasr target in all cases except path as input
        if consensus_seq:
            try:
                f = open(
                    temp_file_path,
                    'wb+') if temp_file_path else tempfile.NamedTemporaryFile(
                        delete=False)
                f.write(str(fasta_from_seq(consensus_seq_id, consensus_seq)))
                cons_path = f.name
                f.close()
            except AttributeError as e:
                raise ValueError(
                    'Cluster calling input invalid. Provide iterable of cluster sequences, path to cluster consensus or Bio.Seq.Seq-like object to call'
                )

        ## run blasr mapping of consensus_seq against allele database
        command = [self.blasr.src, '', self.allele_db, cons_path]

        try:
            mapping_output = self.blasr.run(*command)
        except ValueError as e:
            log.warn('Blasr returned no mapping')
            try:
                cluster.set_call(None)
                cluster.candidates = None
                cluster.candidates_method = str(self)
            except AttributeError as e:
                pass
            finally:
                return None

        if f:
            f.close()

        ## select from mapping the desired result as the call
        if not filter_function:
            filter_function = self.filter_function

        try:
            mapping_output = sorted(mapping_output, key=filter_function)
            cluster_call = mapping_output[0]
        except ValueError as e:
            log.error('Invalid blasr mapping value')
            log.debug('\n'.join([str(x) for x in mapping_output]))
            raise e

        if not result_filter:
            result_filter = self.result_filter
        result = result_filter(cluster_call)

        try:
            cluster.set_call([result])
            cluster.candidates = list(mapping_output)
            cluster.candidates_method = str(self)
        except AttributeError as e:
            return result
Example #29
def Scan(config, ctr_dirs):
    """
    Query the LSF host for all jobs in /[controldir]/processing with ``bjobs``.
    If the job has stopped running, the exit code is read and the 
    diagnostics and comments files are updated. Finally ``gm-kick`` is executed
    on all jobs with an exit code.

    If the exit code can not be read from the diagnostics file, it will (after
    5 tries) be kicked with status UNKNOWN.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_lsf)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs: return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs))

    lsf_bin_path = Config.lsf_bin_path
    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.lsf_setup + ' ' + lsf_bin_path + '/bjobs -w -W ' + ' '.join(jobs.keys()) 
    if os.environ.has_key('__LSF_TEST'):
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)

    def handle_job(info, in_lsf = True):
        job = jobs[info[0]]
        job.state = info[2]
        if job.state in RUNNING:
            if os.path.exists(job.count_file):
                os.remove(job.count_file)
            return

        if set_exit_code_from_diag(job):
            if in_lsf:
                start, end = info[-2:]
                re_date = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d)')
                job.LRMSStartTime = arc.common.Time(get_MDS(re_date.match(start).groupdict()))
                if end != '-':
                    job.LRMSEndTime = arc.common.Time(get_MDS(re_date.match(end).groupdict()))
                    job.WallTime = job.LRMSEndTime - job.LRMSStartTime
            # Job finished and exitcode found
            job.message = MESSAGES[job.state]
            return
        # else
        add_failure(job)

    # Handle jobs known to LSF
    for line in handle.stdout[1:]:
        try:
            info = line.strip().split()
            assert(len(info) == 15)
            handle_job(info)
        except Exception as e:
            if line:
                warn('Failed to parse bjobs line: %s\n%s' % (line, str(e)), 'lsf.Scan')

    # Handle jobs lost in LSF
    if handle.returncode != 0:
        debug('Got error code %i from bjobs' % handle.returncode, 'lsf.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Scan')
        lost_job = re.compile('Job <(\d+)> is not found')
        for line in handle.stderr:
            match = lost_job.match(line)
            if match:
                handle_job([match.groups()[0], None, 'UNKNOWN'], False)

    kicklist = []
    for job in jobs.itervalues():
        if hasattr(job, 'exitcode'):
            with open(job.lrms_done_file, 'w') as f:
                f.write('%d %s\n' % (job.exitcode, job.message))
            write_comments(job)
            update_diag(job)
            kicklist.append(job)
    gm_kick(kicklist)
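bjobs -W reports start and finish times as mm/dd-HH:MM, which is what the re_date pattern above captures before get_MDS converts it. A quick standalone check of that pattern:

import re

re_date = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d)')
print(re_date.match('02/15-15:25').groupdict())
# {'mm': '02', 'dd': '15', 'HH': '15', 'MM': '25'}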
Example #30
    # Slurm can report StartTime and EndTime in at least these two formats:
    # 2010-02-15T15:30:29 (MDS)
    # 02/15-15:25:15
    # Python does not support duplicate named groups.
    # Have to use separate regex if we want to use named groups.
    #date_MDS = re.compile(r'^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')
    #date_2 = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')

    date_MDS = re.compile(r'^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d) (?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$')
    for line in handle.stdout:
        try:
            localid, state = line.strip().split(':', 1)
        except:
            if line:
                warn('Failed to parse squeue line: ' + line, 'slurm.Scan')
            continue
        job = jobs[localid]
        job.state = state 
        if job.state in ['PENDING','RUNNING','SUSPENDED','COMPLETING']:
            continue

        if not job.state:
            set_exit_code_from_diag(job)
        job.message = MESSAGES.get(job.state, '')

        args = Config.slurm_bin_path + '/scontrol -o show job %s' % localid
        scontrol_handle = execute(args)
        if scontrol_handle.returncode != 0:
            debug('Got error code %i from scontrol' % scontrol_handle.returncode, 'slurm.Scan')
            debug('Error output is:\n' + ''.join(scontrol_handle.stderr), 'slurm.Scan')
Example #31
    def cluster(self,
                reads,
                save_input_path=None,
                output_dir=None,
                cached_output=None):
        import shutil

        def get_seq_obj(output):
            seq_mapping = dict([(x.id, x) for x in reads])
            output_seqs = map(lambda x: [seq_mapping[y] for y in x], output)
            return output_seqs

        try:  ## reads are a path
            self.reads = dict([(x.id, x) for x in SeqIO.parse(reads, 'fasta')])
        except AttributeError as e:  ## reads is a list of SeqRecord-like objects
            self.reads = dict([(x.id, x) for x in reads])
        finally:
            if cached_output and self.distance_calculator.matrix:
                self.input_matrix, num_edges, mapping = self.convert_adjacency_matrix(
                    self.distance_calculator.matrix)
                mapping = self.reverse_mappings(mapping, self.reads)
                return [
                    Cluster(x, cluster_id=i, clustering_tool=self)
                    for i, x in enumerate(
                        sorted(self.parse_dsf_output(cached_output, mapping),
                               key=lambda x: len(x),
                               reverse=True))
                ]
            self.distance_calculator.generate_distances(reads)
        try:
            reads.close()
        except AttributeError as e:
            pass

        self.input_matrix, num_edges, mapping = self.convert_adjacency_matrix(
            self.distance_calculator.matrix)
        mapping = self.reverse_mappings(mapping, self.reads)

        log.debug('Number of edges in input graph:' + str(num_edges))

        # write adjacency to file for dsf input
        def writer(matrix, num_edges):
            yield '{} {} 001\n'.format(len(matrix), num_edges)
            for neighbours in matrix:
                line = ' '.join([
                    ' '.join(map(str, (n + 1, w)))
                    for n, w in sorted(neighbours, key=lambda x: x[0])
                ])
                yield '{}\n'.format(line)

        matrix_output_iterator = writer(self.input_matrix, num_edges)
        in_file = tempfile.NamedTemporaryFile(delete=True)

        try:
            if save_input_path:
                in_file = open(save_input_path, 'wb')
        except IOError as e:
            in_file = tempfile.NamedTemporaryFile(delete=True)
            log.warn(
                'Provided DSF input matrix write path not valid, using temporary file'
            )
        log.info('Saving dsf input file to {}'.format(in_file.name))
        in_file.writelines(matrix_output_iterator)
        in_file.flush()

        # check provided output_dir is valid
        if output_dir:
            if not os.path.exists(output_dir):
                log.warn(
                    'Provided DSF output directory path not valid, using temporary directory'
                )
                output_dir = None
            else:
                temp_dir = None

        # make temp output dir if no valid output dir provided
        if not output_dir:
            temp_dir = tempfile.mkdtemp()
            output_dir = temp_dir
            saved_umask = os.umask(
                0077)  # Ensure the file is read/write by the creator only

        # run DSF
        try:
            output = self.run(self.src, self.params, in_file.name, output_dir,
                              mapping)
            # run dsf
        except Exception as e:  # This is just so the temp files get deleted in the case some previous unhandled exception gets raised
            raise e
        finally:
            if temp_dir:
                os.umask(saved_umask)
                shutil.rmtree(temp_dir)
            in_file.close()

        ## generate instances of cluster_class.Cluster as result
        output = [
            Cluster(x, cluster_id=i, clustering_tool=self)
            for i, x in enumerate(
                sorted(output, key=lambda x: len(x), reverse=True))
        ]
        return output
Example #32
    def generate_distances(self,
                           reads=None,
                           minimap=None,
                           filter_func=lambda x: True):
        ## Generates distance matrix of form {read_id: {read_id: distance}}
        ## reads = [Bio.SeqIO, ...] = list of ORIENTED (ie no rev-compl) reads to be clustered (ie containing genes)
        ## if None uses self.reads
        ## minimap = instance of MinimapWrapper object. If none uses pre-set self.minimap
        ## filter_func = a distance is kept in the output only if filter_func(distance) is True
        import copy
        from Bio import SeqIO

        if self.filter_function:
            filter_func = self.filter_function

        if self.matrix:
            log.info('Using cached distance matrix')
            result = self.matrix
            if filter_func:
                result_filtered = self.filter_matrix(
                    copy.deepcopy(self.matrix), filter_func)
                result = result_filtered
            return self.matrix

        if not minimap:
            minimap = self.minimap

        mapping = minimap.ava(reads=reads)

        result = {}

        mapped_reads = set(
        )  # for keeping track of mapped reads to report missing reads
        for i, line in enumerate(mapping):

            mapped_reads.add(line.qName)
            mapped_reads.add(line.tName)

            try:
                if 'NM' not in line.NM:
                    raise IndexError
                NM = int(line.NM.split(':')[2])
            except IndexError as e:
                log.error(
                    'Error in Minimap output: NM field is likely missing\nmapping line:{}'
                    .format('\t'.join(line)))
                log.debug(dir(line))
                log.debug(zip(line.header, line.attributes))
                raise ValueError()

            distance_value = (NM + line.qStart +
                              (line.qLength - line.qEnd) + NM + line.tStart +
                              (line.tLength -
                               line.tEnd)) / float(line.qLength + line.tLength)

            if line.qName not in result:
                result[line.qName] = {}

            result[line.qName][line.tName] = distance_value

        # check if any reads missing from mapping
        missing = set([x.id for x in reads]).difference(mapped_reads)
        if missing:
            log.warn('{} / {} reads missing from mapping'.format(
                len(missing), len(list(reads))))
            log.debug('\n'.join(list(missing)))

        self.matrix = result

        if filter_func:
            result_filtered = self.filter_matrix(copy.deepcopy(result),
                                                 filter_func)

            result = result_filtered

        return result
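The distance above adds the minimap edit distance (NM) to the unaligned overhangs of both query and target, normalised by their combined length. A toy calculation with made-up field values in place of a real mapping record:

# invented values standing in for one minimap overlap record
NM = 12                               # edit distance from the NM:i: tag
qStart, qEnd, qLength = 30, 980, 1000
tStart, tEnd, tLength = 10, 960, 1000

distance = (NM + qStart + (qLength - qEnd) +
            NM + tStart + (tLength - tEnd)) / float(qLength + tLength)
print(distance)  # 0.062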
Example #33
def run(args):
    # add lineage profiles/stats
    
    import re
    from ete2 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True
    
    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())
            
    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in name2tax.values()])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        # these maps are not defined elsewhere in the snippet, so initialise
        # them here before collecting fuzzy-match details
        name2realname, name2score = {}, {}
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim
                
    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))
                
    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = all_taxids.keys()[0]
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(all_taxids.keys(),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)
        
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])        
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print '# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"])
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)         
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            print '\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))])
        
    elif args.info:
        print '# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"])
        translator = ncbi.get_taxid_translator(all_taxids)
        
        ranks = ncbi.get_rank(all_taxids) 
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_lineage(taxid)            
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print '\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string])
Example #34
def store_rates(rates, product):
    """
    Rates must be a list of ticks with the schema
        [unix time, low, high, open, close, volume]
    """
    if len(rates) == 0:
        log.warn('no rates to store')
        return

    log.info('storing {} HISTORIC RATES'.format(len(rates)))
    log.info('first rate date: ' +
             datetime.utcfromtimestamp(rates[-1][0]).isoformat())
    log.info('last rate date: ' +
             datetime.utcfromtimestamp(rates[0][0]).isoformat())

    db_params = dict(
        dbname=dbconfig['db_name'],
        user=dbconfig['db_user'],
        host=dbconfig['host'],
        port=dbconfig['port'],
    )

    log.info('connecting to SQL DB')
    log.info('params:')
    log.info(db_params)

    conn = psycopg2.connect(**db_params)

    conn.set_session(autocommit=True)

    log.info('connected to DB.')

    cur = conn.cursor()

    log.info('creating database {} (if necessary).'.format(
        dbconfig['db_name']))

    cur.execute('CREATE DATABASE IF NOT EXISTS {}'.format(dbconfig['db_name']))

    n_cols = 7

    log.info('creating table {} (if necessary).'.format(_rates_tbl))

    cur.execute('''CREATE TABLE IF NOT EXISTS {} (
        product STRING,
        timestamp INT,
        low DECIMAL,
        high DECIMAL,
        open DECIMAL,
        close DECIMAL,
        volume DECIMAL,
        PRIMARY KEY (product, timestamp)
        )'''.format(_rates_tbl))

    # List of rate values
    rate_vals = [[product] + rate for rate in rates]

    assert len(rate_vals[0]) == n_cols

    # [:-1] is to remove the last comma
    sql_tuple_str = '(' + ('%s,' * n_cols)[:-1] + '),'
    sql_batch_str = (sql_tuple_str * _batch_sz)[:-1]

    log.info('inserting rates into DB...')

    while len(rate_vals) > 0:
        sql_vals_str = sql_batch_str
        if len(rate_vals) < _batch_sz:
            sql_vals_str = (sql_tuple_str * len(rate_vals))[:-1]

        batch = rate_vals[:_batch_sz]
        flatten_vals = [val for rate in batch for val in rate]
        cur.execute('UPSERT INTO {} VALUES '.format(_rates_tbl) + sql_vals_str,
                    flatten_vals)
        log.info('upserted {} rates.'.format(len(batch)))
        rate_vals = rate_vals[_batch_sz:]

    log.info('inserting {} HISTORIC RATES complete.'.format(len(rates)))
    cur.close()
    conn.close()
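The placeholder string for the batched UPSERT is built by repeating one (%s,...,%s) tuple per row and trimming the trailing comma. A standalone check of that construction, independent of any database:

n_cols = 7
batch_sz = 3
sql_tuple_str = '(' + ('%s,' * n_cols)[:-1] + '),'
sql_batch_str = (sql_tuple_str * batch_sz)[:-1]
print(sql_batch_str)
# (%s,%s,%s,%s,%s,%s,%s),(%s,%s,%s,%s,%s,%s,%s),(%s,%s,%s,%s,%s,%s,%s)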
Example #35
        log.err("Output type (-ot) must be either 'forest', 'derivation', or 'derived'.")
        sys.exit(1)
    
    if not args.weight_type in ['prob', 'logprob']:
        log.err("Weight type (-m) must be either 'prob'or 'logprob'.")
        sys.exit(1)

    logprob = (args.weight_type == 'logprob')


    if args.output_type == "forest":
        if not args.output_file:       
            log.err("Need to provide '-o FILE_PREFIX' with output type 'forest'.")
            sys.exit(1)
        if args.k:
            log.warn("Ignoring -k command line option because output type is 'forest'.")    
    
    if not args.parser in ['td', 'basic']:
        log.err("Parser (-p) must be either 'td' or 'basic'.")
        sys.exit(1)
    
    if args.parser != 'td' and args.boundary_nodes: 
        log.warn('The -bn option is only relevant for the tree decomposition parser ("-p td").')

    if args.k > config.maxk:
        log.err("k must be <= than %i (defined in in args.py)." % args.maxk)
        sys.exit(1)

    if args.verbose < 0 or args.verbose > 4:
        log.err("Invalid verbosity level, must be 0-4.")
        sys.exit(1)
Example #36
def Scan(config, ctr_dirs):
    """
    Query the SLURM host for all jobs in /[controldir]/processing with ``squeue``.
    If the job has stopped running, more detailed information is fetched with ``scontrol``,
    and the diagnostics and comments files are updated. Finally ``gm-kick`` is executed
    on all jobs with an exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7) * len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.slurm_bin_path + "/squeue -a -h -o %i:%T -t all -j " + ",".join(jobs.keys())
    if os.environ.has_key("__SLURM_TEST"):
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug("Got error code %i from squeue" % handle.returncode, "slurm.Scan")
        debug("Error output is:\n" + "".join(handle.stderr), "slurm.Scan")

    # Slurm can report StartTime and EndTime in at least these two formats:
    # 2010-02-15T15:30:29 (MDS)
    # 02/15-15:25:15
    # Python does not support duplicate named groups, so a separate
    # regex is needed for each format to keep the named groups.
    date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
    date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

    for line in handle.stdout:
        try:
            localid, state = line.strip().split(":", 1)
        except ValueError:
            if line:
                warn("Failed to parse squeue line: " + line, "slurm.Scan")
            continue
        job = jobs[localid]
        job.state = state
        if job.state in RUNNING:
            continue

        if not job.state:
            set_exit_code_from_diag(job)
        job.message = MESSAGES.get(job.state, "")

        args = Config.slurm_bin_path + "/scontrol -o show job %s" % localid
        scontrol_handle = execute(args)
        if scontrol_handle.returncode != 0:
            debug("Got error code %i from scontrol" % scontrol_handle.returncode, "slurm.Scan")
            debug("Error output is:\n" + "".join(scontrol_handle.stderr), "slurm.Scan")

        try:
            scontrol_dict = dict(item.split("=", 1) for item in re.split(" (?=[^ =]+=)", scontrol_handle.stdout[0]))
            job = jobs[scontrol_dict["JobId"]]
        except (KeyError, IndexError, ValueError):
            warn("Failed to parse scontrol output for job " + localid, "slurm.Scan")
            continue

        if "ExitCode" in scontrol_dict:
            ec1, ec2 = scontrol_dict["ExitCode"].split(":")
            job.exitcode = int(ec2) + 256 if int(ec2) != 0 else int(ec1)
        else:
            job.exitcode = 0 if state == "COMPLETED" else -1

        if (state == "NODE_FAIL" or state == "CANCELLED") and ("ExitCode" not in scontrol_dict or job.exitcode == 0):
            job.exitcode = 15
            job.message = "Job was cancelled by SLURM"

        if "StartTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["StartTime"]) or date_2.match(scontrol_dict["StartTime"])
            scontrol_dict["StartTime"] = get_MDS(match.groupdict())
            job.LRMSStartTime = arc.common.Time(scontrol_dict["StartTime"])
        if "EndTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["EndTime"]) or date_2.match(scontrol_dict["EndTime"])
            scontrol_dict["EndTime"] = get_MDS(match.groupdict())
            job.LRMSEndTime = arc.common.Time(scontrol_dict["EndTime"])

        if "StartTime" in scontrol_dict and "EndTime" in scontrol_dict:
            job.WallTime = job.LRMSEndTime - job.LRMSStartTime

        if "NumCPUs" in scontrol_dict:
            job.Processors = scontrol_dict["NumCPUs"]

        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        write_comments(job)
        update_diag(job)

    kicklist = [job for job in jobs.itervalues() if job.state not in RUNNING]
    kicklist.extend([job for job in jobs.itervalues() if job.state == "CANCELLED"])  # kick twice
    gm_kick(kicklist)
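
The two regexes above cover both timestamp formats Slurm may report; the short form carries
no year, so one has to be supplied when normalizing. A standalone sketch of that
normalization (to_mds is a hypothetical stand-in for the module's get_MDS helper):

import re
import datetime

date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

def to_mds(timestr, assumed_year=None):
    # Normalize either format to YYYY-MM-DDTHH:MM:SS; the slash format has no year field.
    g = (date_MDS.match(timestr) or date_2.match(timestr)).groupdict()
    g.setdefault('YYYY', str(assumed_year or datetime.date.today().year))
    return "{YYYY}-{mm}-{dd}T{HH}:{MM}:{SS}".format(**g)

print(to_mds("2010-02-15T15:30:29"))   # 2010-02-15T15:30:29
print(to_mds("02/15-15:25:15", 2010))  # 2010-02-15T15:25:15
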
Example #37
0
def run(args):
    # add lineage profiles/stats

    import re
    from ete2 import PhyloTree, NCBITaxa

    if not args.taxonomy and not args.info:
        args.taxonomy = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []

    if not args.search:
        log.error('Search terms must be provided (via --search).')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names; the realname/score maps are filled by fuzzy matching below
    name2tax = ncbi.get_name_translator(all_names)
    name2realname = {}
    name2score = {}
    all_taxids.update([(v, None) for v in name2tax.values()])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # fall back to fuzzy matching for names with no exact translation
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(all_taxids.keys(),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.info:
        print '# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"])
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print '\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ])
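
The loop over args.search above simply partitions the query terms into numeric taxids and
free-text names before translation. The same partitioning in isolation, with made-up inputs:

terms = ['9606', 'Homo sapiens', '10090', 'Mus']  # illustrative inputs only
taxids, names = {}, set()
for t in terms:
    try:
        taxids[int(t)] = None
    except ValueError:
        names.add(t.strip())
print(sorted(taxids))  # [9606, 10090]
print(sorted(names))   # ['Homo sapiens', 'Mus']
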
Example #38
0
def get_data(force,
             gene,
             pseudogene,
             reverse_complement,
             parser,
             fix_karolinska,
             genome_range,
             gene_ids,
             coordinate,
             patch,
             post_process,
             functional_exceptions,
             unique_regions,
             max_cn,
             custom_url=None):
    def sf(x):
        y = re.split(r'(\d+)', x[len(gene):])
        return int(y[1]), y[2]

    # Get Karolinska's data
    cypdata = karolinska.get_karolinska_database(gene, parser, force,
                                                 custom_url)
    if fix_karolinska is not None:
        fix_karolinska(cypdata)

    #pprint (cypdata)

    # Get NCBI data for genes and reference genome
    genes, hg19 = ncbi.get_genomes(gene_ids[0],
                                   genome_range,
                                   gene_ids[1:],
                                   force=force,
                                   reverse_complement=reverse_complement)

    new_seq = genes[gene].seq.tomutable()
    for c, n in patch:
        new_seq[coordinate(c, genes[gene])] = n
    genes[gene] = genes[gene]._replace(seq=new_seq.toseq())

    # Fix Karolinska's coordinates
    result = merger.merge(cypdata, genes[gene], coordinate,
                          functional_exceptions, reverse_complement)
    ## pprint(genes['CYP21'].translation)
    ## pprint(genes['CYP21P'].translation)

    mx = collections.defaultdict(lambda: ['', []])
    for a in result:
        for m in result[a]['mutations']:
            mx[(m['pos'], m['op'])][0] = m
            mx[(m['pos'], m['op'])][1].append(a)
    for m in genes[gene].pseudo_mutations.values():
        m['functional'] = merger.is_functional(
            genes[gene], m, genes[gene].pseudo_mutations.values(), True)
        # if (m['pos'], m['op']) in mx:
        # 	log.warn('[{}] {} (from {}) originates from {}',
        # 		' F'[mx[(m['pos'], m['op'])][0]['functional']],
        # 		mx[(m['pos'], m['op'])][0]['old'],
        # 		','.join(set(mx[(m['pos'], m['op'])][1])),
        # 		m['old']
        # 	)

    # Remove mutations not present in hg19 and fix the coordinates
    for a in result:
        for m in result[a]['mutations']:
            if m['pos'] == 'pseudogene': continue
            if m['pos'] not in genes[gene].translation:
                log.warn('Main: Translation not found for {}: {} ({})', a,
                         m['old'], m['pos'])
                m['pos'] = None
            else:
                m['pos'] = genes[gene].translation[m['pos']]
        result[a]['mutations'] = [
            m for m in result[a]['mutations'] if not m['pos'] is None
        ]

    # Fetch missing dbSNP links
    result = dbsnp.get_dbsnp(result, genome_range, force)

    # Fix exon and intron coordinates
    for _, g in genes.iteritems():
        g.exons[:] = map(
            lambda x: (g.translation[int(x.start)], g.translation[int(x.end)]),
            g.exons)
        g.introns[:] = map(
            lambda x: (g.translation[int(x.start)], g.translation[int(x.end)]),
            g.introns)

    # Patch hg19 with reference SNPs
    hg19 = list(hg19)
    for gi, hi in genes[gene].translation.iteritems():
        if hg19[hi - genome_range[1]] != genes[gene].seq[gi]:
            hg19[hi - genome_range[1]] = genes[gene].seq[gi]
    hg19 = ''.join(hg19)

    result.update({
        gene + '*1': {
            'mutations': [],
            'phenotype': {
                'invivo': 'Normal',
                'invitro': 'Normal'
            }
        }
    })

    # Add missing regions
    post_process(genes, result)

    hoi = collections.OrderedDict()
    for pos, m in genes[gene].pseudo_translation.iteritems():
        hoi[genes[gene].translation[pos]] = NoIndent(
            (genes[pseudogene].translation[m['old_pos']],
             m['op'] if 'op' in m else ''))
    return dict(
        #map=hoi,
        seq=hg19,
        region=NoIndent(genome_range),
        name=gene,
        exons={
            '{}'.format(ei + 1): NoIndent(e)
            for ei, e in enumerate(genes[gene].exons)
        },
        special_regions={
            g: NoIndent(gg)
            for g, gg in genes[gene].special_regions.iteritems()
        },
        pseudogenes={
            g: {
                'exons': {
                    '{}'.format(ei + 1): NoIndent(e)
                    for ei, e in enumerate(genes[g].exons)
                },
                'special_regions': {
                    g: NoIndent(gg)
                    for g, gg in genes[g].special_regions.iteritems()
                }
            }
            for g in [pseudogene]
        } if pseudogene is not None else {},
        # Regions used for CNV detection of each gene
        unique_regions=NoIndent(unique_regions),
        # Unique CYP2D8 region used for CNV detection
        # Based on BLAT, that is [5e-4i-4e]
        cnv_region=NoIndent(('chr22', 42547463, 42548249)),
        alleles=OrderedDict([(a, {
            'phenotype':
            NoIndent(result[a]['phenotype']),
            'mutations': [
                NoIndent(
                    OrderedDict([(x, y[x]) for x in sorted(y, reverse=True)]))
                for y in result[a]['mutations']
            ]
        }) for a in sorted(result, key=sf)]),
        max_cn=max_cn)
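
The sf key above orders allele names numerically by the number following the gene prefix
and then by the remaining suffix, so *2 sorts before *10. A standalone illustration with a
made-up gene name and allele labels:

import re

gene = 'CYP2D6'  # hypothetical prefix; in the original it comes from the function arguments

def sf(x):
    y = re.split(r'(\d+)', x[len(gene):])
    return int(y[1]), y[2]

alleles = ['CYP2D6*10', 'CYP2D6*2A', 'CYP2D6*1']
print(sorted(alleles, key=sf))  # ['CYP2D6*1', 'CYP2D6*2A', 'CYP2D6*10']
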
Example #39
0
        sys.exit(1)

    if args.weight_type not in ['prob', 'logprob']:
        log.err("Weight type (-m) must be either 'prob' or 'logprob'.")
        sys.exit(1)

    logprob = (args.weight_type == 'logprob')

    if args.output_type == "forest":
        if not args.output_file:
            log.err(
                "Need to provide '-o FILE_PREFIX' with output type 'forest'.")
            sys.exit(1)
        if args.k:
            log.warn(
                "Ignoring -k command line option because output type is 'forest'."
            )

    if args.parser not in ['td', 'basic']:
        log.err("Parser (-p) must be either 'td' or 'basic'.")
        sys.exit(1)

    if args.parser != 'td' and args.boundary_nodes:
        log.warn(
            'The -bn option is only relevant for the tree decomposition parser ("-p td").'
        )

    if args.k > config.maxk:
        log.err("k must be <= %i (defined in args.py)." % config.maxk)
        sys.exit(1)
Example #40
0
        allele_db='../database/V-QUEST-reference-allele-db+no-period-references.clustalw.no-gaps.fasta',
        num_mappings_to_save=5,
        skip_extraction=False):

    mapper.params = '-cx map-pb -k10 -w3 -N{}'.format(num_mappings_to_save - 1)
    try:
        mappings = mapper.run(reads_path, allele_db)
        reads = dict([(x.id, x) for x in SeqIO.parse(reads_path, 'fasta')])
    except (IOError, ValueError):
        log.error('Reads file does not exist or is invalid')
        raise ValueError

    unmapped = set(reads.keys()).difference([x.qName for x in mappings])
    if len(unmapped) > 0:
        log.warn('{} reads had no allele mapping; they will be removed'.format(
            len(unmapped)))
        log.debug('Read ids to be removed:\n' + '\n'.join(unmapped))
        reads = dict([(k, v) for k, v in reads.iteritems()
                      if k not in unmapped])
    log.info('Loaded {} reads'.format(len(reads)))

    if not skip_extraction:
        reads, mappings = get_read_segments(reads, mappings)

    modified_seqs = []
    for m in mappings:
        try:
            reads[m.qName].mapping.append(m)
        except AttributeError:
            reads[m.qName].mapping = [m]