Example #1
def dispatch_request(ip, request):
    method = REQUEST_TABLE[request['request']]
    spec = inspect.getargspec(method)
    
    # figure out what args are required
    if spec[3] is not None:
        required_args = set(spec[0][:-len(spec[3])]) # set of args with no default value
    else:
        required_args = set(spec[0]) # all args are required

    # remove compulsory stuff from request dict, add IP
    del request['version']
    del request['api_key']
    del request['nonce']
    del request['signature']
    del request['timestamp']
    del request['request']
    request['ip'] = ip

    # check required args are provided and we haven't been sent superfluous args
    keys = set(request.keys())
    if not keys.issubset(spec[0]):
        raise errors.InvalidArgumentError() # some request args aren't in method args
    elif not required_args.issubset(keys):
        raise errors.InvalidArgumentError() # some required method args aren't in request args

    return method(**request) # call into target method
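A minimal usage sketch (not part of the original module): it assumes the snippet's `inspect` import and `errors` module are in scope, and registers a hypothetical 'ticker' handler in REQUEST_TABLE; the authentication field values are placeholders only.

def ticker(ip):
    # hypothetical endpoint: one required argument, no defaults
    return {'last': '123.45'}

REQUEST_TABLE = {'ticker': ticker}

request = {
    'version': 1, 'api_key': 'k', 'nonce': 1, 'signature': 's',
    'timestamp': 0, 'request': 'ticker',
}
print(dispatch_request('127.0.0.1', request))  # -> {'last': '123.45'}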
Example #2
def market_depth(ip, step=0.05):
    try:
        step = Decimal(str(step))
    except Exception:
        raise errors.InvalidArgumentError()

    if step <= 0:
        raise errors.InvalidArgumentError()

    cursor_bid = run_market_depth_query(step, 'bid')
    cursor_ask = run_market_depth_query(step, 'ask')

    return {
        'bid': compile_market_depth_results(cursor_bid, step),
        'ask': compile_market_depth_results(cursor_ask, step)
    }
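Illustrative calls (assumptions: a configured database connection for the two queries, plus the `errors` module used above); the step validation runs before any SQL is issued.

market_depth('127.0.0.1')               # default step of 0.05
market_depth('127.0.0.1', step='0.01')  # anything Decimal() accepts works
market_depth('127.0.0.1', step=0)       # raises errors.InvalidArgumentError
market_depth('127.0.0.1', step='abc')   # raises errors.InvalidArgumentError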
Example #3
def run_market_depth_query(step, column):
    if column == 'bid':
        offer_currency = 'GBP'
    elif column == 'ask':
        offer_currency = 'BTC'
    else:
        raise errors.InvalidArgumentError()

    query = """SELECT
        CAST(txdb_order.%s / %s AS UNSIGNED INTEGER) * %s as 'price',
        SUM(txdb_order.want_amount) as 'volume'
    FROM txdb_order
    INNER JOIN txdb_balance
        ON txdb_order.balance_id = txdb_balance.id
    INNER JOIN txdb_currency
        ON txdb_balance.currency_id = txdb_currency.id
    WHERE txdb_order.filled = 0 AND txdb_order.cancelled = 0
        AND txdb_currency.code = %s
    GROUP BY CAST(txdb_order.%s / %s AS UNSIGNED INTEGER)
    ORDER BY price ASC""" % (column, '%s', '%s', '%s', column, '%s')

    cursor = connection.cursor()
    cursor.execute(query, [step, step, offer_currency, step])
    return cursor
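The query is built in two stages: the column name is interpolated with `%` (safe only because it has just been whitelisted to 'bid' or 'ask'), while the step and currency values remain `%s` placeholders for the driver to bind in cursor.execute(). A toy reconstruction of that pattern, not the project's actual query:

template = "SELECT CAST(t.%s / %s AS UNSIGNED INTEGER) FROM t WHERE code = %s"
query = template % ('bid', '%s', '%s')  # interpolate the trusted column name only
print(query)
# SELECT CAST(t.bid / %s AS UNSIGNED INTEGER) FROM t WHERE code = %s
# cursor.execute(query, [step, currency]) would then bind the remaining placeholders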
Example #4
def one_task_per_interval(
        interval_count,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        task_key_params=['name', 'inputs', 'interval', 'ref'],
        script=arvados.current_job()['script']):
    """
    Queue one task for each of interval_count intervals, splitting
    the genome chunk (described by the .interval_list file) evenly.

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param=interval_list_param)

    interval_reader = open(interval_list_file, mode="r")

    lines = interval_reader.readlines()
    sn_intervals = dict()
    sns = []
    total_len = 0
    for line in lines:
        if line[0] == '@':
            # skip all lines starting with '@'
            continue
        fields = line.split("\t")
        if len(fields) != 5:
            raise errors.InvalidArgumentError(
                "interval_list %s has invalid line [%s] - expected 5 fields but got %s"
                % (interval_list_file, line, len(fields)))
        sn = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        length = int(end) - int(start) + 1
        total_len += int(length)
        sn_intervals[sn] = (start, end)
        sns.append(sn)

    print "Total chunk length is %s" % total_len
    interval_len = int(total_len / interval_count)
    intervals = []
    print "Splitting chunk into %s intervals of size ~%s" % (interval_count,
                                                             interval_len)
    for interval_i in range(0, interval_count):
        interval_num = interval_i + 1
        intervals_count = 0
        remaining_len = interval_len
        interval = []
        while len(sns) > 0:
            sn = sns.pop(0)
            if sn not in sn_intervals:
                raise errors.ValueError(
                    "sn_intervals missing entry for sn [%s]" % sn)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining_len:
                # not enough space for the whole sq, split it
                real_end = end
                end = remaining_len + start - 1
                assert ((end - start + 1) <= remaining_len)
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            interval.append("%s:%s-%s" % (sn, start, end))
            remaining_len -= (end - start + 1)
            intervals_count += 1
            if remaining_len <= 0:
                break
        if intervals_count > 0:
            intervals.append(interval)
        else:
            print "WARNING: skipping empty intervals for %s" % interval_input_name
    print "Have %s intervals" % (len(intervals))

    if reuse_tasks:
        # get candidates for task reuse
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]
        reusable_tasks = get_reusable_tasks(if_sequence + 1, task_key_params,
                                            job_filters)
        print "Have %s potentially reusable tasks" % (len(reusable_tasks))

    for interval in intervals:
        interval_str = ' '.join(interval)
        print "Creating task to process interval: [%s]" % interval_str
        new_task_params = arvados.current_task()['parameters']
        new_task_params['interval'] = interval_str
        if reuse_tasks:
            task = create_or_reuse_task(if_sequence + 1, new_task_params,
                                        reusable_tasks, task_key_params,
                                        validate_task_output)
        else:
            task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
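The core of the function is the chunking loop: walk the sequence names in order, emit `sn:start-end` ranges until the per-chunk length budget is spent, and split a sequence in place when it does not fit. A self-contained sketch of just that logic, with made-up sequence names and lengths:

def split_into_chunks(sn_intervals, sns, chunk_count):
    # sn_intervals: {name: (start, end)}; sns: names in the order to consume them
    total_len = sum(end - start + 1 for start, end in sn_intervals.values())
    chunk_len = int(total_len / chunk_count)
    chunks = []
    for _ in range(chunk_count):
        remaining, chunk = chunk_len, []
        while sns:
            sn = sns.pop(0)
            start, end = sn_intervals[sn]
            if (end - start + 1) > remaining:
                # not enough room: split the sequence and push the tail back
                real_end = end
                end = start + remaining - 1
                sn_intervals[sn] = (end + 1, real_end)
                sns.insert(0, sn)
            chunk.append("%s:%s-%s" % (sn, start, end))
            remaining -= (end - start + 1)
            if remaining <= 0:
                break
        if chunk:
            chunks.append(chunk)
    return chunks

print(split_into_chunks({'chr1': (1, 100), 'chr2': (1, 50)}, ['chr1', 'chr2'], 3))
# [['chr1:1-50'], ['chr1:51-100'], ['chr2:1-50']]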
Example #5
def one_task_per_group_combined_inputs(ref_input,
                                       job_input,
                                       interval_lists,
                                       group_by_regex,
                                       if_sequence=0,
                                       and_end_task=True,
                                       create_task_func=create_task):
    """
    Queue one task for each group of gVCFs and corresponding interval_list
    in the inputs_collection, with grouping based on two things:
      - the stream in which the gVCFs are held within the collection
      - the value of the named capture group "group_by" in the
        group_by_regex against the filename in the inputs_collection

    Each new task will have an "inputs" parameter: a manifest
    containing a set of one or more gVCF files and its corresponding
    index.

    Each new task will also have a "ref" parameter: a manifest
    containing the reference files to use.

    Note that all gVCFs not matching the group_by_regex are ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    group_by_r = re.compile(group_by_regex)

    # prepare interval_lists
    il_cr = arvados.CollectionReader(interval_lists)
    il_ignored_files = []
    interval_list_by_group = {}
    for s in il_cr.all_streams():
        for f in s.all_files():
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            il_ignored_files.append("%s/%s" % (s.name(), f.name()))

    # prepare gVCF input collections
    cr = arvados.CollectionReader(job_input)
    ignored_files = []
    last_stream_name = ""
    gvcf_by_group = {}
    gvcf_indices = {}
    for s in sorted(cr.all_streams(), key=lambda stream: stream.name()):
        stream_name = s.name()
        # handle each stream name separately
        if stream_name != last_stream_name:
            if last_stream_name != "":
                print "Done processing files in stream %s" % last_stream_name
                one_task_per_gvcf_group_in_stream_combined_inputs(
                    last_stream_name,
                    gvcf_by_group,
                    gvcf_indices,
                    interval_list_by_group,
                    if_sequence,
                    ref_input,
                    create_task_func=create_task_func)
                # now that we are done with last_stream_name, reinitialise dicts to
                # process data from new stream
                print "Processing files in stream %s" % stream_name
                gvcf_by_group = {}
                gvcf_indices = {}
            last_stream_name = stream_name

        # loop over all the files in this stream (there may be only one)
        for f in s.all_files():
            if re.search(r'\.tbi$', f.name()):
                gvcf_indices[s.name(), f.name()] = f
                continue
            m = re.search(group_by_r, f.name())
            if m:
                group_name = m.group('group_by')
                gvcf_m = re.search(r'\.vcf\.gz$', f.name())
                if gvcf_m:
                    if group_name not in gvcf_by_group:
                        gvcf_by_group[group_name] = dict()
                    gvcf_by_group[group_name][s.name(), f.name()] = f
                    continue
                interval_list_m = re.search(r'\.interval_list', f.name())
                if interval_list_m:
                    if group_name not in interval_list_by_group:
                        interval_list_by_group[group_name] = dict()
                    if (s.name(), f.name()) in interval_list_by_group[group_name]:
                        existing = interval_list_by_group[group_name][s.name(), f.name()]
                        if existing.as_manifest() != f.as_manifest():
                            raise errors.InvalidArgumentError(
                                "Already have interval_list for group %s file %s/%s, but manifests are not identical!"
                                % (group_name, s.name(), f.name()))
                    else:
                        interval_list_by_group[group_name][s.name(), f.name()] = f
                    continue
            # if we make it this far, we have files that we are ignoring
            ignored_files.append("%s/%s" % (s.name(), f.name()))
    # finally, process the last stream
    print "Processing last stream"
    one_task_per_gvcf_group_in_stream_combined_inputs(
        stream_name,
        gvcf_by_group,
        gvcf_indices,
        interval_list_by_group,
        if_sequence,
        ref_input,
        create_task_func=create_task_func)

    # report on any ignored files
    if len(ignored_files) > 0:
        print "WARNING: ignored non-matching files in inputs_collection: %s" % (
            ' '.join(ignored_files))
        # TODO: could use `setmedian` from https://github.com/ztane/python-Levenshtein
        # to print most representative "median" filename (i.e. skipped 15 files like median), then compare the
        # rest of the files to that median (perhaps with `ratio`)

    if and_end_task:
        print "Ending task %s successfully" % if_sequence
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
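The grouping hinges on group_by_regex containing a named capture group called group_by; files the pattern does not match end up in the ignored list. A small illustration with a made-up pattern and file names:

import re

# hypothetical pattern: group files by the chromosome token embedded in the name
group_by_r = re.compile(r'\.(?P<group_by>chr[0-9XYM]+)\.')

for name in ['sampleA.chr1.vcf.gz', 'sampleA.chr1.vcf.gz.tbi',
             'sampleB.chr2.vcf.gz', 'README.txt']:
    m = re.search(group_by_r, name)
    print("%s -> %s" % (name, m.group('group_by') if m else 'ignored'))
# sampleA.chr1.vcf.gz -> chr1
# sampleA.chr1.vcf.gz.tbi -> chr1
# sampleB.chr2.vcf.gz -> chr2
# README.txt -> ignored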
Example #6
def chunked_tasks_per_cram_file(
        ref_input,
        job_input,
        interval_lists,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=True,
        reuse_tasks_retrieve_all=True,
        interval_list_param="interval_list",
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script=arvados.current_job()['script']):
    """
    Queue one task for each cram file in this job's input collection.
    Each new task will have an "input" parameter: a manifest
    containing one .cram file and its corresponding .crai index file.
    Files in the input collection that are not named *.cram or *.crai
    (as well as *.crai files that do not match any .cram file present)
    are silently ignored.
    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    # prepare interval lists
    cr = arvados.CollectionReader(interval_lists)
    chunk_interval_list = {}
    chunk_input_pdh_names = []
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.interval_list$', f.name()):
                chunk_interval_list[s.name(), f.name()] = f
    for ((s_name, f_name),
         chunk_interval_list_f) in sorted(chunk_interval_list.items()):
        chunk_input = chunk_interval_list_f.as_manifest()
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": chunk_input
            }).execute()
            chunk_input_pdh = r["portable_data_hash"]
            chunk_input_name = os.path.join(s_name, f_name)
            chunk_input_pdh_names.append((chunk_input_pdh, chunk_input_name))
        except:
            raise

    if len(chunk_input_pdh_names) == 0:
        raise errors.InvalidArgumentError(
            "No interval_list files found in %s" % (interval_lists))

    # prepare CRAM input collections
    cr = arvados.CollectionReader(job_input)
    cram = {}
    crai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.cram$', f.name()):
                cram[s.name(), f.name()] = f
            elif re.search(r'\.crai$', f.name()):
                crai[s.name(), f.name()] = f
    for ((s_name, f_name), cram_f) in cram.items():
        crai_f = crai.get(
            (s_name, re.sub(r'cram$', 'crai', f_name)),
            crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
        task_input = cram_f.as_manifest()
        if crai_f:
            task_input += crai_f.as_manifest()
        else:
            # no CRAI for CRAM
            raise errors.InvalidArgumentError(
                "No correponding CRAI file found for CRAM file %s" % f_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={
                "manifest_text": task_input
            }).execute()
            task_input_pdh = r["portable_data_hash"]
        except:
            raise

        if reuse_tasks:
            task_key_params = ['input', 'ref', 'chunk']
            # get candidates for task reuse
            job_filters = [
                ['script', '=', script],
                ['repository', '=',
                 arvados.current_job()['repository']],
                ['script_version', 'in git', oldest_git_commit_to_reuse],
                [
                    'docker_image_locator', 'in docker',
                    arvados.current_job()['docker_image_locator']
                ],
            ]
            if reuse_tasks_retrieve_all:
                # retrieve a full set of all possible reusable tasks
                reusable_tasks = get_reusable_tasks(if_sequence + 1,
                                                    task_key_params,
                                                    job_filters)
                print "Have %s tasks for potential reuse" % (
                    len(reusable_tasks))
            else:
                reusable_task_jobs = get_jobs_for_task_reuse(job_filters)
                print "Have %s jobs for potential task reuse" % (
                    len(reusable_task_jobs))
                reusable_task_job_uuids = [
                    job['uuid'] for job in reusable_task_jobs['items']
                ]

        for chunk_input_pdh, chunk_input_name in chunk_input_pdh_names:
            # Create task for each CRAM / chunk
            new_task_params = {
                'input': task_input_pdh,
                'ref': ref_input,
                'chunk': chunk_input_pdh
            }
            print "Creating new task to process %s with chunk interval %s " % (
                f_name, chunk_input_name)
            if reuse_tasks:
                if reuse_tasks_retrieve_all:
                    task = create_or_reuse_task(if_sequence + 1,
                                                new_task_params,
                                                reusable_tasks,
                                                task_key_params,
                                                validate_task_output)
                else:
                    task = create_or_reuse_task_from_jobs(
                        if_sequence + 1, new_task_params,
                        reusable_task_job_uuids, task_key_params,
                        validate_task_output)
            else:
                task = create_task(if_sequence + 1, new_task_params)

    if and_end_task:
        print "Ending task 0 successfully"
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
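CRAM indices may be named either `foo.crai` or `foo.cram.crai`, and the lookup above tries both spellings before raising. A minimal reconstruction of just that pairing step, with made-up entries standing in for the collection files:

import re

cram = {('.', 'sampleA.cram'): 'cramA', ('.', 'sampleB.cram'): 'cramB'}
crai = {('.', 'sampleA.crai'): 'craiA', ('.', 'sampleB.cram.crai'): 'craiB'}

for (s_name, f_name) in sorted(cram):
    crai_f = crai.get(
        (s_name, re.sub(r'cram$', 'crai', f_name)),
        crai.get((s_name, re.sub(r'cram$', 'cram.crai', f_name)), None))
    print("%s -> %s" % (f_name, crai_f))
# sampleA.cram -> craiA
# sampleB.cram -> craiB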
Example #7
def one_task_per_gvcf_group_in_stream_combined_inputs(
        stream_name,
        gvcf_by_group,
        gvcf_indices,
        interval_list_by_group,
        if_sequence,
        ref_input_pdh,
        create_task_func=create_task):
    """
    Process one stream of data and launch a subtask for handling it
    """
    print "Finalising stream %s" % stream_name
    for group_name in sorted(gvcf_by_group.keys()):
        print "Have %s gVCFs in group %s" % (len(
            gvcf_by_group[group_name]), group_name)
        # require interval_list for this group
        if group_name not in interval_list_by_group:
            raise errors.InvalidArgumentError(
                "Inputs collection did not contain interval_list for group %s"
                % group_name)
        interval_lists = interval_list_by_group[group_name].keys()
        if len(interval_lists) > 1:
            raise errors.InvalidArgumentError(
                "Inputs collection contained more than one interval_list for group %s: %s"
                % (group_name, ' '.join('%s/%s' % il for il in interval_lists)))
        interval_list_manifest = interval_list_by_group[group_name].get(
            interval_lists[0]).as_manifest()

        # "combined_inputs" style is to have interval_list and inputs in same collection
        task_inputs_manifest = interval_list_manifest
        for ((s_name, gvcf_name), gvcf_f) in gvcf_by_group[group_name].items():
            task_inputs_manifest += gvcf_f.as_manifest()
            gvcf_index_f = gvcf_indices.get(
                (s_name, re.sub(r'vcf.gz$', 'vcf.tbi', gvcf_name)),
                gvcf_indices.get(
                    (s_name, re.sub(r'vcf.gz$', 'vcf.gz.tbi', gvcf_name)),
                    None))
            if gvcf_index_f:
                task_inputs_manifest += gvcf_index_f.as_manifest()
            else:
                # no index for gVCF - TODO: should this be an error or warning?
                print "WARNING: No correponding .tbi index file found for gVCF file %s" % gvcf_name
                #raise errors.InvalidArgumentError("No correponding .tbi index file found for gVCF file %s" % gvcf_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(
                body={
                    "manifest_text": task_inputs_manifest
                }).execute()
            task_inputs_pdh = r["portable_data_hash"]
        except:
            raise

        # Create task to process this group
        name_components = []
        if len(stream_name) > 0 and stream_name != ".":
            name_components.append(stream_name)
        if len(group_name) > 0:
            name_components.append(group_name)
        if len(name_components) == 0:
            name = "all"
        else:
            name = '::'.join(name_components)

        print "Creating task to process %s" % name
        new_task_params = {
            'inputs': task_inputs_pdh,
            'ref': ref_input_pdh,
            'name': name
        }
        task = create_task_func(if_sequence + 1, new_task_params)
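Task names are assembled from whichever of stream_name and group_name carry information, joined with '::', falling back to "all" when both are empty. The naming rule in isolation:

def task_name(stream_name, group_name):
    name_components = []
    if len(stream_name) > 0 and stream_name != ".":
        name_components.append(stream_name)
    if len(group_name) > 0:
        name_components.append(group_name)
    return '::'.join(name_components) if name_components else "all"

print(task_name("batch1", "chr20"))  # batch1::chr20
print(task_name(".", "chr20"))       # chr20
print(task_name(".", ""))            # all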
Example #8
def historical_prices(ip, start_date=None, end_date=None, scope='daily'):
    try:
        start_date = int(start_date) if start_date is not None else None
        end_date = int(end_date) if end_date is not None else None
    except Exception:
        raise errors.InvalidArgumentError()

    # are both valid unix timestamps?
    if start_date is not None and (start_date < 0 or start_date > 0x7FFFFFFF):
        raise errors.InvalidArgumentError()
    elif end_date is not None and (end_date < 0 or end_date > 0x7FFFFFFF or end_date <= start_date):
        raise errors.InvalidArgumentError()

    # adjust scope of query
    if scope == 'monthly':
        group_by = 'YEAR(txdb_transaction.executed), MONTH(txdb_transaction.executed)'
    elif scope == 'weekly':
        group_by = 'YEARWEEK(txdb_transaction.executed)'
    elif scope == 'hourly':
        group_by = 'DATE(txdb_transaction.executed), HOUR(txdb_transaction.executed)'
    elif scope == '15mins':
        group_by = 'DATE(txdb_transaction.executed), HOUR(txdb_transaction.executed), CAST(MINUTE(txdb_transaction.executed) / 15 AS UNSIGNED INTEGER)'
    else: # scope == 'daily'
        group_by = 'DATE(txdb_transaction.executed)'

    # construct where clause limiting date range
    date_limits = ""
    date_params = []
    if start_date:
        date_limits += " AND txdb_transaction.executed >= FROM_UNIXTIME(%s)"
        date_params.append(start_date)
    if end_date:
        date_limits += " AND txdb_transaction.executed <= FROM_UNIXTIME(%s)"
        date_params.append(end_date)

    # abusing MySQL since 2011
    query = """SELECT
        txdb_transaction.executed as 'timestamp',
        SUBSTRING_INDEX(GROUP_CONCAT(txdb_order.bid ORDER BY txdb_transaction.executed ASC), ',', 1) AS 'open',
        SUBSTRING_INDEX(GROUP_CONCAT(txdb_order.bid ORDER BY txdb_transaction.executed DESC), ',', 1) AS 'close',
        SUBSTRING_INDEX(GROUP_CONCAT(txdb_order.bid ORDER BY txdb_order.bid ASC), ',', 1) AS 'low',
        SUBSTRING_INDEX(GROUP_CONCAT(txdb_order.bid ORDER BY txdb_order.bid DESC), ',', 1) AS 'high',
        AVG(txdb_order.bid) as 'mean',
        SUM(linked_transaction.amount) as 'volume'
    FROM txdb_transaction
    INNER JOIN txdb_balance from_balance
        ON txdb_transaction.from_balance_id = from_balance.id
    INNER JOIN txdb_currency from_currency
        ON from_balance.currency_id = from_currency.id
    INNER JOIN txdb_transaction linked_transaction
        ON txdb_transaction.linked_transaction_id = linked_transaction.id
    INNER JOIN txdb_order
        ON txdb_transaction.order_id = txdb_order.id
    WHERE txdb_transaction.reversed = 0 AND from_currency.code = 'GBP'
        %s
    GROUP BY %s
    ORDER BY txdb_transaction.executed ASC
    LIMIT %d""" % (date_limits, group_by, settings.HISTORICAL_PRICES_ROW_LIMIT)

    cursor = connection.cursor()
    cursor.execute(query, date_params)

    results = {
        'timestamp': [],
        'open': [],
        'close': [],
        'low': [],
        'high': [],
        'mean': [],
        'volume': []
    }

    # format the results into something usable by ChartDirector
    row = cursor.fetchone()
    while row:
        ts = calendar.timegm(floor_datetime(row[0], scope).timetuple())
        results['timestamp'].append(ts)
        results['open'].append(str(row[1]))
        results['close'].append(str(row[2]))
        results['low'].append(str(row[3]))
        results['high'].append(str(row[4]))
        results['mean'].append(str(row[5]))
        results['volume'].append(str(row[6]))
        row = cursor.fetchone()

    return results
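Illustrative calls, assuming the Django-style `connection`, `settings` and the helper functions the snippet relies on; the timestamps are arbitrary Unix times.

# hourly candles within an explicit date range
historical_prices('127.0.0.1', start_date=1357000000, end_date=1357086400,
                  scope='hourly')

# any unrecognised scope falls back to daily grouping
historical_prices('127.0.0.1', scope='yearly')

# invalid dates are rejected before any SQL is run
historical_prices('127.0.0.1', start_date='not-a-timestamp')
# -> raises errors.InvalidArgumentError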