Example #1
0
def test_parse_qc_table_fastqc(summary, data, quality_metric_fastqc_schema):
    """FastQC summary + data tables parse into the expected metadata dict."""
    parsed = fastqc_utils.parse_qc_table(
        [summary, data],
        url='test_url',
        qc_schema=quality_metric_fastqc_schema)
    expected = {
        'Total Sequences': 557747,
        'Kmer Content': 'WARN',
        'url': 'test_url',
        'overall_quality_status': 'PASS',
    }
    for field, value in expected.items():
        assert parsed[field] == value
Example #2
0
def test_parse_qc_table_pairsqc(pairsqc_summary,
                                quality_metric_pairsqc_schema):
    """A PairsQC summary table parses into the expected metadata dict."""
    parsed = fastqc_utils.parse_qc_table(
        [pairsqc_summary],
        url='test_url',
        qc_schema=quality_metric_pairsqc_schema)
    expected = {
        'Total reads': 651962,
        'Cis/Trans ratio': 64.141,
        'convergence': 'Good',
    }
    for field, value in expected.items():
        assert parsed[field] == value
Example #3
0
def _qc_updater(status,
                awsemfile,
                ff_meta,
                tibanna,
                quality_metric='quality_metric_fastqc',
                file_argument='input_fastq',
                report_html=None,
                datafiles=None,
                zipped=True,
                other_fields=None,
                datajson_argument=None):
    """Create a QualityMetric item from a QC run's output and link it.

    Parses the QC output files (optionally extracted from a zipped report on
    s3), posts a ``quality_metric`` item, patches the original file with a
    link to it, and returns a partial patch for the workflow-run metadata.

    :param status: run status; ``'uploading'`` short-circuits (run not done).
    :param awsemfile: output-file wrapper giving s3 access and run info.
    :param ff_meta: workflow-run metadata whose output_files get ``value_qc``.
    :param tibanna: settings object providing ``ff_keys`` and ``env``.
    :param quality_metric: item type to post (also names the schema profile).
    :param file_argument: workflow argument naming the file being QC'd.
    :param report_html: name of the html report inside the QC output, if any.
    :param datafiles: QC data file names to parse; defaults to the FastQC
        pair ``['summary.txt', 'fastqc_data.txt']``.
    :param zipped: whether the QC output is a zip archive on s3.
    :param other_fields: extra key/values merged into the posted metadata.
    :param datajson_argument: workflow argument holding JSON data files; if
        it matches ``awsemfile``'s argument, nothing is done for that file.
    :returns: dict with updated ``output_files``, or ``None`` when skipped.
    :raises Exception: if unzipping, metadata retrieval, or patching fails.
    """
    if datajson_argument == awsemfile.argument_name:
        return
    # avoid using [] as default argument
    if datafiles is None:
        datafiles = ['summary.txt', 'fastqc_data.txt']
    if status == 'uploading':
        # wait until this bad boy is finished
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to proper s3 location
    # need to remove sbg from this line
    accession = awsemfile.runner.get_file_accessions(file_argument)[0]
    zipped_report = awsemfile.key
    # copy so appending the report name never mutates the caller's list
    files_to_parse = list(datafiles)
    if report_html:
        files_to_parse.append(report_html)
    printlog("accession is %s" % accession)
    jsondata = dict()
    if zipped:
        try:
            files = awsemfile.s3.unzip_s3_to_s3(zipped_report,
                                                accession,
                                                files_to_parse,
                                                acl='public-read')
        except Exception as e:
            printlog(tibanna.s3.__dict__)
            # build the message with format only; %-interpolation after
            # .format breaks when str(e) contains a stray '%'
            raise Exception("{} (key={})\n".format(e, zipped_report))
        printlog("files : %s" % str(files))
        filedata = [files[_]['data'] for _ in datafiles]
    else:
        if datajson_argument:
            datajson_key = awsemfile.runner.get_file_key(datajson_argument)
            jsondata0 = [
                json.loads(awsemfile.s3.read_s3(_)) for _ in datajson_key
            ]
            for d in jsondata0:
                jsondata.update(d)
        filedata = [awsemfile.s3.read_s3(_) for _ in datafiles]
        # NOTE(review): assumes report_html is set whenever zipped=False —
        # read_s3(None) would fail here; confirm against callers
        reportdata = awsemfile.s3.read_s3(report_html)
        report_html = accession + 'qc_report.html'
        awsemfile.s3.s3_put(reportdata, report_html, acl='public-read')
        qc_url = 'https://s3.amazonaws.com/' + awsemfile.bucket + '/' + report_html
        files = {report_html: {'data': reportdata, 's3key': qc_url}}
    # schema. do not need to check_queue
    qc_schema = ff_utils.get_metadata("profiles/" + quality_metric + ".json",
                                      key=ff_key,
                                      ff_env=tibanna.env)
    # parse fastqc metadata
    if report_html in files:
        qc_url = files[report_html]['s3key']
    else:
        qc_url = None
    meta = parse_qc_table(filedata,
                          qc_schema=qc_schema.get('properties'),
                          url=qc_url)
    if jsondata:
        meta.update(jsondata)
    # custom fields: a single update suffices (the old per-key loop applied
    # the whole dict once per key, redundantly)
    if other_fields:
        meta.update(other_fields)
    printlog("qc meta is %s" % meta)
    # post fastq metadata
    qc_meta = ff_utils.post_metadata(meta, quality_metric, key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]
    printlog("qc_meta is %s" % qc_meta)
    # update original file as well
    try:
        original_file = ff_utils.get_metadata(accession,
                                              key=ff_key,
                                              ff_env=tibanna.env,
                                              add_on='frame=object',
                                              check_queue=True)
        printlog("original_file is %s" % original_file)
    except Exception as e:
        raise Exception(
            "Couldn't get metadata for accession {} : ".format(accession) +
            str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater." + str(e) +
                        "original_file ={}\n".format(str(original_file)))
    # patch the workflow run, value_qc is used to make drawing graphs easier.
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {'output_files': output_files}
    printlog("retval is %s" % retval)
    return retval
Example #4
0
def _qc_updater(status,
                awsemfile,
                ff_meta,
                tibanna,
                quality_metric='quality_metric_fastqc',
                file_argument='input_fastq',
                report_html=None,
                datafiles=None):
    """Create a QualityMetric item from a zipped QC report and link it.

    Unzips the QC report on s3, parses the data files, posts a
    ``quality_metric`` item, patches the original file with a link to it,
    and returns a partial patch for the workflow-run metadata.

    :param status: run status; ``'uploading'`` short-circuits (run not done).
    :param awsemfile: output-file wrapper giving s3 access and run info.
    :param ff_meta: workflow-run metadata whose output_files get ``value_qc``.
    :param tibanna: settings object providing ``ff_keys`` and ``env``.
    :param quality_metric: item type to post (also names the schema profile).
    :param file_argument: workflow argument naming the file being QC'd.
    :param report_html: name of the html report inside the zip, if any.
    :param datafiles: QC data file names to parse; defaults to the FastQC
        pair ``['summary.txt', 'fastqc_data.txt']``.
    :returns: dict with ``output_quality_metrics`` and updated
        ``output_files``, or ``None`` when the run is still uploading.
    :raises Exception: if unzipping, metadata retrieval, or patching fails.
    """
    # avoid using [] as default argument
    if datafiles is None:
        datafiles = ['summary.txt', 'fastqc_data.txt']
    if status == 'uploading':
        # wait until this bad boy is finished
        return
    # keys
    ff_key = tibanna.ff_keys
    # move files to proper s3 location
    # need to remove sbg from this line
    accession = awsemfile.runner.all_file_accessions[file_argument]
    zipped_report = awsemfile.key
    # copy so appending the report name never mutates the caller's list
    files_to_parse = list(datafiles)
    if report_html:
        files_to_parse.append(report_html)
    LOG.info("accession is %s" % accession)
    try:
        files = awsemfile.s3.unzip_s3_to_s3(zipped_report,
                                            accession,
                                            files_to_parse,
                                            acl='public-read')
    except Exception as e:
        LOG.info(tibanna.s3.__dict__)
        # build the message with format only; %-interpolation after .format
        # breaks when str(e) contains a stray '%'
        raise Exception("{} (key={})\n".format(e, zipped_report))
    # schema. do not need to check_queue
    qc_schema = ff_utils.get_metadata("profiles/" + quality_metric + ".json",
                                      key=ff_key,
                                      ff_env=tibanna.env)
    # parse fastqc metadata
    LOG.info("files : %s" % str(files))
    filedata = [files[_]['data'] for _ in datafiles]
    if report_html in files:
        qc_url = files[report_html]['s3key']
    else:
        qc_url = None
    meta = parse_qc_table(filedata,
                          qc_schema=qc_schema.get('properties'),
                          url=qc_url)
    LOG.info("qc meta is %s" % meta)
    # post fastq metadata
    qc_meta = ff_utils.post_metadata(meta, quality_metric, key=ff_key)
    if qc_meta.get('@graph'):
        qc_meta = qc_meta['@graph'][0]
    LOG.info("qc_meta is %s" % qc_meta)
    # update original file as well
    try:
        original_file = ff_utils.get_metadata(accession,
                                              key=ff_key,
                                              ff_env=tibanna.env,
                                              add_on='frame=object',
                                              check_queue=True)
        LOG.info("original_file is %s" % original_file)
    except Exception as e:
        raise Exception(
            "Couldn't get metadata for accession {} : ".format(accession) +
            str(e))
    patch_file = {'quality_metric': qc_meta['@id']}
    try:
        ff_utils.patch_metadata(patch_file, original_file['uuid'], key=ff_key)
    except Exception as e:
        raise Exception("patch_metadata failed in fastqc_updater." + str(e) +
                        "original_file ={}\n".format(str(original_file)))
    # patch the workflow run, value_qc is used to make drawing graphs easier.
    output_files = ff_meta.output_files
    output_files[0]['value_qc'] = qc_meta['@id']
    retval = {
        "output_quality_metrics": [{
            "name": quality_metric,
            "value": qc_meta['@id']
        }],
        'output_files':
        output_files
    }
    LOG.info("retval is %s" % retval)
    return retval