Exemple #1
0
def load_effects(manifest_path, database, **kwargs):
    """Load variant effects from the VCF file named in a manifest.

    The ``vcf`` section of the manifest at ``manifest_path`` is read; when it
    has no ``file`` entry nothing is loaded.  Otherwise the VCF (resolved
    relative to the manifest's directory) is wrapped in an ``EffectStream``
    and handed to ``pgcopy_batch`` together with the ``VariantEffect`` table
    name and a cursor on the ``database`` connection.

    Extra keyword arguments are accepted and ignored.  Always returns None.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest does not reference a VCF file
    if 'file' not in vcf_section:
        return

    db_cursor = connections[database].cursor()

    # The VCF file name is interpreted relative to the manifest's directory
    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        message = "opening {0} in {1} load_effects".format(vcf_path, __name__)
        log.debug(message)
        effect_stream = EffectStream(vcf_file)
        target_columns = effect_stream.output_columns
        target_table = VariantEffect._meta.db_table
        pgcopy_batch(effect_stream, target_table, target_columns, db_cursor,
                     database)
Exemple #2
0
def load_effects(manifest_path, database, **kwargs):
    """Copy the effects found in a manifest's VCF into ``VariantEffect``.

    Reads the ``vcf`` section of the manifest located at ``manifest_path``.
    If that section has no ``file`` key the function returns without doing
    anything.  Otherwise the referenced file (relative to the manifest's
    directory) is opened, wrapped in an ``EffectStream`` and passed to
    ``pgcopy_batch`` along with a cursor obtained from
    ``connections[database]``.

    ``**kwargs`` is accepted but unused.  The return value is always None.
    """
    reader = ManifestReader(manifest_path)
    vcf_section = reader.section('vcf')

    # Manifest carries no VCF file reference: nothing to load
    if 'file' not in vcf_section:
        return

    db_cursor = connections[database].cursor()

    base_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(base_dir, vcf_section['file'])

    with open(vcf_path) as handle:
        log.debug(
            "opening {0} in {1} load_effects".format(vcf_path, __name__))
        effects = EffectStream(handle)
        column_names = effects.output_columns
        table_name = VariantEffect._meta.db_table
        pgcopy_batch(effects, table_name, column_names, db_cursor, database)
Exemple #3
0
def load_variants(manifest_path, database, **kwargs):
    """Load variants from the VCF file named in a manifest.

    Variant loading requires only a VCF file and will never load a duplicate.

    The ``vcf`` section of the manifest at ``manifest_path`` is read; when it
    has no ``file`` entry the function returns immediately.  Otherwise the
    VCF (resolved relative to the manifest's directory) is wrapped in a
    ``VariantStream`` and handed to ``pgcopy_batch`` together with the
    ``Variant`` table name and a cursor on the ``database`` connection.

    Extra keyword arguments are accepted and ignored.  Always returns None.
    """
    vcf_section = ManifestReader(manifest_path).section('vcf')

    # Nothing to do when the manifest does not reference a VCF file
    if 'file' not in vcf_section:
        return

    db_cursor = connections[database].cursor()

    # The VCF file name is interpreted relative to the manifest's directory
    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_section['file'])

    with open(vcf_path) as vcf_file:
        message = "opening {0} in {1}".format(vcf_path, __name__)
        log.debug(message)
        variant_stream = VariantStream(vcf_file)
        target_columns = variant_stream.output_columns
        target_table = Variant._meta.db_table
        pgcopy_batch(variant_stream, target_table, target_columns, db_cursor,
                     database)

    VARIANT_CHANNEL.publish(manifest_path=manifest_path, database=database)
Exemple #4
0
def load_variants(manifest_path, database, **kwargs):
    """Copy the variants found in a manifest's VCF into ``Variant``.

    Variant loading requires only a VCF file and will never load a duplicate.

    Reads the ``vcf`` section of the manifest located at ``manifest_path``.
    If that section has no ``file`` key the function returns without loading
    or publishing anything.  Otherwise the referenced file (relative to the
    manifest's directory) is opened, wrapped in a ``VariantStream`` and
    passed to ``pgcopy_batch`` along with a cursor obtained from
    ``connections[database]``.  Afterwards the manifest path and database
    are published on ``VARIANT_CHANNEL``.

    ``**kwargs`` is accepted but unused.  The return value is always None.
    """
    reader = ManifestReader(manifest_path)
    vcf_section = reader.section('vcf')

    # Manifest carries no VCF file reference: nothing to load
    if 'file' not in vcf_section:
        return

    db_cursor = connections[database].cursor()

    base_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(base_dir, vcf_section['file'])

    with open(vcf_path) as handle:
        log.debug("opening {0} in {1}".format(vcf_path, __name__))
        variants = VariantStream(handle)
        column_names = variants.output_columns
        table_name = Variant._meta.db_table
        pgcopy_batch(variants, table_name, column_names, db_cursor, database)

    # Announce that the variants of this manifest have been loaded
    VARIANT_CHANNEL.publish(manifest_path=manifest_path, database=database)
Exemple #5
0
def load_samples(manifest_path, database, **kwargs):
    """Create sample records for every sample column of a manifest's VCF.

    The manifest at ``manifest_path`` must be marked for load and have a
    valid sample section; otherwise the function logs and returns None.
    An example of the expected ``[sample]`` section::

        [sample]
        project = PCGC
        batch = OTHER
        sample = 1-03131
        version = 1

    The sample names are taken from the VCF header.  If the manifest has a
    ``sample`` entry it is split on commas and used as the display names for
    those VCF samples (the two lists must have the same length, otherwise
    the function logs and returns None).

    For each sample a record is created via ``create_sample`` and, if the
    sample is new or has no ``SampleManifest`` yet, a ``SampleManifest`` is
    saved.  The manifest is published on ``SAMPLE_CHANNEL`` when at least
    one sample was created or ``force=True`` was passed in ``kwargs``.

    Returns a dict ``{'created': <int>, 'skipped': <int>}`` on completion.
    """
    manifest_reader = ManifestReader(manifest_path)
    log_extra = {
        'manifest_path': manifest_path,
    }

    # Ensure the sample is marked to be loaded..
    if not manifest_reader.marked_for_load():
        log.info('Sample not marked for load', extra=log_extra)
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest_reader):
        log.info('Manifest sample section is not valid', extra=log_extra)
        return

    sample_info = manifest_reader.section('sample')
    vcf_info = manifest_reader.section('vcf')

    # The VCF header, not the manifest, decides which samples exist
    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_info['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in load_samples".format(vcf_path))
        samples = vcf.Reader(vcf_file).samples

    # Display names come from the manifest when given, else the VCF names
    pretty_names = (sample_info['sample'].split(',')
                    if 'sample' in sample_info else samples)

    if len(samples) != len(pretty_names):
        mismatch = ('Length of comma-delimited samples field in manifest '
                    'does not match the length of samples in {0}')
        log.info(mismatch.format(vcf_info['file']))
        return

    # Create the sample (and batch and project if needed)..
    load_dict = {'created': 0, 'skipped': 0}

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        log.debug('Trying to create {0} sample record'.format(vcf_sample))
        sample, created = create_sample(
            sample_name=pretty_name,
            vcf_colname=vcf_sample,
            batch_name=sample_info['batch'],
            project_name=sample_info['project'],
            version=sample_info['version'])
        log.debug('{0} created'.format(sample))

        if created:
            load_dict['created'] += 1
            sts.transition(sample, 'Sample Record Created')
        else:
            load_dict['skipped'] += 1

        existing_manifests = SampleManifest.objects.filter(sample=sample)
        # Create a manifest object for the sample if one does not exist
        if created or not existing_manifests.exists():
            sample_manifest = SampleManifest(sample=sample)
            sample_manifest.load_content(manifest_path)
            sample_manifest.save()
            sts.transition(sample, 'Sample Manifest Created')

    # Publish to channel that this manifest is eligible for processing
    # downstream..
    if load_dict['created'] > 0 or kwargs.get('force', False):
        SAMPLE_CHANNEL.publish(manifest_path=manifest_path, database=database)

    # Report how many samples were created and how many already existed
    return load_dict
Exemple #6
0
def load_results(manifest_path, database, **kwargs):
    """Load per-sample results from a manifest's VCF and publish the samples.

    The manifest at ``manifest_path`` must be marked for load and have a
    valid sample section; otherwise the function logs and returns.  Sample
    names are read from the VCF header; a comma-delimited ``sample`` entry in
    the manifest, when present, supplies the display names used to look the
    ``Sample`` records up (both lists must have the same length).

    For every sample that has no ``Result`` rows yet, the VCF is streamed
    through ``ResultStream`` into the ``Result`` table with ``pgcopy_batch``
    inside a ``'Sample Published'`` transition, after which the sample's
    ``count`` is refreshed and it is marked published.  A failed attempt is
    logged with its traceback and retried after 10 seconds.

    After all samples were handled the VCF MD5 is compared with the
    manifest's ``md5`` entry (a mismatch is only logged) and, for each
    processed sample, records with the same name/project/batch and a lower
    version are unpublished.

    If any sample record is missing the function logs an error and returns
    right away.  ``**kwargs`` is accepted but unused.  Always returns None.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load', extra={
            'manifest_path': manifest_path,
        })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid', extra={
            'manifest_path': manifest_path,
        })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # Ignore whatever sample is listed in the manifest and scan the vcf for
    # samples.  The VCF path is relative to the MANIFEST directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])
    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_results".format(vcf_path))
        reader = vcf.Reader(file_obj)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    # zip() below would silently drop the surplus entries, pairing display
    # names with the wrong VCF columns; refuse to continue instead.
    if len(samples) != len(pretty_names):
        log.info('Length of comma-delimited samples field in manifest '
                 'does not match the length of samples in {0}'
                 .format(vcf_info['file']))
        return

    # Every sample handled by the loop; used for unpublishing older versions
    processed_samples = []

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            log.error('Sample does not exist', extra=sample_info)
            return

        processed_samples.append(sample)

        # Is it already loaded, let's skip for now
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
            continue

        log.debug('about to load results for {0}'.format(vcf_sample))

        # The retry loop was introduced for
        # "STSError: Cannot start transition while already in one."
        # NOTE(review): the loop is unbounded, so a permanent failure keeps
        # retrying every 10 seconds; consider a retry limit.
        successful = False
        while not successful:
            try:
                with transition(sample, 'Sample Published',
                                event='Loading Results'):
                    connection = connections[database]
                    cursor = connection.cursor()

                    with open(vcf_path) as fin:
                        stream = ResultStream(fin, sample_id=sample.id,
                                              vcf_sample=vcf_sample)
                        columns = stream.output_columns
                        db_table = Result._meta.db_table
                        pgcopy_batch(stream, db_table, columns, cursor,
                                     database)

                    # Update result count
                    sample.count = sample.results.count()
                    sample.published = True
                    sample.save()
                    successful = True
            except Exception:
                # Only Exception is caught so KeyboardInterrupt/SystemExit
                # can still stop the loop; the traceback is logged so the
                # actual cause is not lost.
                log.error('STS errors', exc_info=True)
                time.sleep(10)

    # Compare expected MD5 (in manifest) to the file MD5
    if 'md5' in vcf_info:
        vcf_md5 = checks.file_md5(vcf_path)

        if vcf_md5 != vcf_info['md5']:
            log.error('VCF file MD5 does not match expected in manifest',
                      extra={'manifest_path': manifest_path})

    # Existing samples by the same name of a previous version are unpublished
    # now that the new version is ready to be published.  This is done for
    # every processed sample, not just the last one of the loop.
    for sample in processed_samples:
        count = Sample.objects.filter(
            name__iexact=sample.name, project=sample.project,
            batch=sample.batch,
            version__lt=sample.version).update(published=False)

        if count:
            log.info('{0} previous versions unpublished for {1}'
                     .format(count, sample))
Exemple #7
0
def load_samples(manifest_path, database, **kwargs):
    """Create sample records for every sample column of a manifest's VCF.

    The manifest at ``manifest_path`` must be marked for load and have a
    valid sample section; otherwise the function logs and returns None.
    An example of the expected ``[sample]`` section::

        [sample]
        project = PCGC
        batch = OTHER
        sample = 1-03131
        version = 1

    The sample names are taken from the VCF header.  If the manifest has a
    ``sample`` entry it is split on commas and used as the display names for
    those VCF samples (the two lists must have the same length, otherwise
    the function logs and returns None).

    For each sample a record is created via ``create_sample`` and, if the
    sample is new or has no ``SampleManifest`` yet, a ``SampleManifest`` is
    saved.  The manifest is then always published on ``SAMPLE_CHANNEL``.

    Returns a dict ``{'created': <int>, 'skipped': <int>}`` on completion.
    """
    manifest_reader = ManifestReader(manifest_path)
    log_extra = {
        'manifest_path': manifest_path,
    }

    # Ensure the sample is marked to be loaded..
    if not manifest_reader.marked_for_load():
        log.info('Sample not marked for load', extra=log_extra)
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest_reader):
        log.info('Manifest sample section is not valid', extra=log_extra)
        return

    sample_info = manifest_reader.section('sample')
    vcf_info = manifest_reader.section('vcf')

    # The VCF header, not the manifest, decides which samples exist
    manifest_dir = os.path.dirname(manifest_path)
    vcf_path = os.path.join(manifest_dir, vcf_info['file'])

    with open(vcf_path) as vcf_file:
        log.debug("opening {0} in load_samples".format(vcf_path))
        samples = vcf.Reader(vcf_file, preserve_order=False).samples

    # Display names come from the manifest when given, else the VCF names
    pretty_names = (sample_info['sample'].split(',')
                    if 'sample' in sample_info else samples)

    if len(samples) != len(pretty_names):
        mismatch = ("Length of comma-delimited samples field in manifest "
                    "does not match the length of samples in {0}")
        log.info(mismatch.format(vcf_info['file']))
        return

    # Create the sample (and batch and project if needed)..
    load_dict = {'created': 0, 'skipped': 0}

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        log.debug("trying to create {0} sample record".format(vcf_sample))
        sample, created = create_sample(
            sample_name=pretty_name,
            vcf_colname=vcf_sample,
            batch_name=sample_info['batch'],
            project_name=sample_info['project'],
            version=sample_info['version'])
        log.debug("{0} created".format(sample))

        if created:
            load_dict['created'] += 1
            sts.transition(sample, 'Sample Record Created')
        else:
            load_dict['skipped'] += 1

        # Create a manifest object for the sample if one does not exist
        needs_manifest = created or not SampleManifest.objects.filter(
            sample=sample).exists()
        if needs_manifest:
            sample_manifest = SampleManifest(sample=sample)
            sample_manifest.load_content(manifest_path)
            sample_manifest.save()
            sts.transition(sample, 'Sample Manifest Created')

    # Publish to channel that this manifest is eligible for processing
    # downstream..  This happens unconditionally: the check on the number of
    # created samples / the ``force`` keyword is disabled in this version.
    SAMPLE_CHANNEL.publish(manifest_path=manifest_path, database=database)

    # Report how many samples were created and how many already existed
    return load_dict
Exemple #8
0
def load_results(manifest_path, database, **kwargs):
    """Load per-sample results from a manifest's VCF and publish the samples.

    The manifest at ``manifest_path`` must be marked for load and have a
    valid sample section; otherwise the function logs and returns.  Sample
    names are read from the VCF header; a comma-delimited ``sample`` entry in
    the manifest, when present, supplies the display names used to look the
    ``Sample`` records up (both lists must have the same length).

    For every sample that has no ``Result`` rows yet, the VCF is streamed
    through ``ResultStream`` into the ``Result`` table with ``pgcopy_batch``
    inside a ``'Sample Published'`` transition, after which the sample's
    ``count`` is refreshed and it is marked published.  A failed attempt is
    logged with its traceback and retried after 10 seconds.

    After all samples were handled the VCF MD5 is compared with the
    manifest's ``md5`` entry (a mismatch is only logged) and, for each
    processed sample, records with the same name/project/batch and a lower
    version are unpublished.

    If any sample record is missing the function logs an error and returns
    right away.  ``**kwargs`` is accepted but unused.  Always returns None.
    """
    manifest = ManifestReader(manifest_path)

    if not manifest.marked_for_load():
        log.info('Sample not marked for load',
                 extra={
                     'manifest_path': manifest_path,
                 })
        return

    # Ensure the sample section is valid..
    if not check_sample_section(manifest):
        log.info('Manifest sample section is not valid',
                 extra={
                     'manifest_path': manifest_path,
                 })
        return

    sample_info = manifest.section('sample')
    vcf_info = manifest.section('vcf')

    # Ignore whatever sample is listed in the manifest and scan the vcf for
    # samples.  The VCF path is relative to the MANIFEST directory.
    vcf_path = os.path.join(os.path.dirname(manifest_path), vcf_info['file'])
    with open(vcf_path) as file_obj:
        log.debug("opening {0} in load_results".format(vcf_path))
        reader = vcf.Reader(file_obj, preserve_order=False)
        samples = reader.samples

    if 'sample' in sample_info:
        pretty_names = sample_info['sample'].split(',')
    else:
        pretty_names = samples

    # zip() below would silently drop the surplus entries, pairing display
    # names with the wrong VCF columns; refuse to continue instead.
    if len(samples) != len(pretty_names):
        log.info(
            "Length of comma-delimited samples field in manifest does not "
            "match the length of samples in {0}".format(vcf_info['file']))
        return

    # Every sample handled by the loop; used for unpublishing older versions
    processed_samples = []

    for pretty_name, vcf_sample in zip(pretty_names, samples):
        try:
            sample = Sample.objects.get(
                name__iexact=pretty_name,
                batch__name__iexact=sample_info['batch'],
                project__name__iexact=sample_info['project'],
                version=sample_info['version'])
        except Sample.DoesNotExist:
            log.error('Sample does not exist', extra=sample_info)
            return

        processed_samples.append(sample)

        # Is it already loaded, let's skip for now
        if Result.objects.filter(sample=sample).exists():
            log.debug('{0} exists in results'.format(vcf_sample))
            continue

        log.debug('about to load results for {0}'.format(vcf_sample))

        # The retry loop was introduced for
        # "STSError: Cannot start transition while already in one."
        # NOTE(review): the loop is unbounded, so a permanent failure keeps
        # retrying every 10 seconds; consider a retry limit.
        successful = False
        while not successful:
            try:
                with transition(sample,
                                'Sample Published',
                                event='Loading Results'):
                    connection = connections[database]
                    cursor = connection.cursor()

                    with open(vcf_path) as fin:
                        stream = ResultStream(fin,
                                              sample_id=sample.id,
                                              vcf_sample=vcf_sample)
                        columns = stream.output_columns
                        db_table = Result._meta.db_table
                        pgcopy_batch(stream, db_table, columns, cursor,
                                     database)

                    # Update result count
                    sample.count = sample.results.count()
                    sample.published = True
                    sample.save()
                    successful = True
            except Exception:
                # Only Exception is caught so KeyboardInterrupt/SystemExit
                # can still stop the loop; the traceback is logged so the
                # actual cause is not lost.
                log.error('STS errors', exc_info=True)
                time.sleep(10)

    # Compare expected MD5 (in manifest) to the file MD5
    if 'md5' in vcf_info:
        vcf_md5 = checks.file_md5(vcf_path)

        if vcf_md5 != vcf_info['md5']:
            log.error('VCF file MD5 does not match expected in manifest',
                      extra={
                          'manifest_path': manifest_path,
                      })

    # Existing samples by the same name of a previous version are unpublished
    # now that the new version is ready to be published.  This is done for
    # every processed sample, not just the last one of the loop.
    for sample in processed_samples:
        count = Sample.objects.filter(
            name__iexact=sample.name,
            project=sample.project,
            batch=sample.batch,
            version__lt=sample.version).update(published=False)

        if count:
            log.info('{0} previous versions unpublished for {1}'.format(
                count, sample))