Example #1
0
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)
    original_files = job.original_files.all()

    original_file = original_files.first()
    sample = original_file.samples.first()

    downloaded_files = []
    success = None

    for original_file in original_files:
        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file, job, dl_file_path)

        if success:
            original_file.set_downloaded(dl_file_path)

            # ENA's file-report endpoint only reports on .fastq files,
            # so we can only check expected md5/size_in_bytes for
            # those files.
            if ".fastq" in original_file.source_filename:
                md5_mismatch = (
                    original_file.expected_md5
                    and original_file.md5 != original_file.expected_md5)
                size_in_bytes_mismatch = (original_file.expected_size_in_bytes
                                          and original_file.size_in_bytes !=
                                          original_file.expected_size_in_bytes)

                if md5_mismatch or size_in_bytes_mismatch:
                    success = False
                    job.failure_reason = "md5 or size_in_bytes didn't match"
                    logger.error(
                        job.failure_reason,
                        expected_md5=original_file.expected_md5,
                        actual_md5=original_file.md5,
                        expected_size_in_bytes=original_file.expected_size_in_bytes,
                        actual_size_in_bytes=original_file.size_in_bytes,
                    )
                    break

            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
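The expected_md5 and expected_size_in_bytes checks above only work if something populated those fields earlier. A sketch of how they could be fetched from ENA's filereport endpoint (which, per the comment above, only reports on .fastq files); the helper name and the exact OriginalFile fields it feeds are assumptions, not part of this example:

# Sketch only: one way to look up per-FASTQ md5/size from ENA's filereport.
import csv
import io
import urllib.request


def _fetch_ena_file_report(run_accession: str) -> dict:
    """Return {fastq_filename: (md5, size_in_bytes)} for one ENA/SRA run."""
    url = (
        "https://www.ebi.ac.uk/ena/portal/api/filereport"
        "?accession=" + run_accession
        + "&result=read_run&fields=fastq_ftp,fastq_md5,fastq_bytes&format=tsv"
    )
    with urllib.request.urlopen(url) as response:
        text = response.read().decode("utf-8")

    report = {}
    for row in csv.DictReader(io.StringIO(text), delimiter="\t"):
        if not row["fastq_ftp"]:
            continue
        # The three fields are semicolon-separated lists in matching order.
        ftp_urls = row["fastq_ftp"].split(";")
        md5s = row["fastq_md5"].split(";")
        sizes = row["fastq_bytes"].split(";")
        for ftp_url, md5, size in zip(ftp_urls, md5s, sizes):
            report[ftp_url.split("/")[-1]] = (md5, int(size))
    return report

A caller could use this mapping to set original_file.expected_md5 and original_file.expected_size_in_bytes before the downloader job runs, so the comparison above has something to compare against.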
Example #2
0
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    original_files = job.original_files.all()

    original_file = original_files[0]
    sample = original_file.samples.first()
    if _has_unmated_reads(sample.accession_code):
        original_files = _replace_dotsra_with_fastq_files(
            sample, job, original_file)
    else:
        # _replace_dotsra_with_fastq_files returns a list of
        # OriginalFiles so turn the queryset of
        # DownloaderJobOriginalFileAssociations into a list of
        # OriginalFiles to be consistent.
        original_files = [assoc.original_file for assoc in file_assocs]

    downloaded_files = []
    success = None
    for original_file in original_files:
        if original_file.is_downloaded:
            logger.info("File already downloaded!",
                        original_file_id=original_file.id,
                        downloader_job=job_id)
            success = True
            continue

        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.is_downloaded = True
            original_file.absolute_file_path = dl_file_path
            original_file.filename = original_file.source_filename
            original_file.is_archive = False
            original_file.calculate_size()
            original_file.calculate_sha1()
            original_file.save()

            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
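_has_unmated_reads and _replace_dotsra_with_fastq_files are referenced above but not shown. One plausible shape for the check, assuming it inspects ENA's filereport (a paired-end run with unmated reads typically lists a third FASTQ alongside _1 and _2); this is a sketch, not the original implementation:

# Sketch only: counts the FASTQ files ENA reports for the run.
import csv
import io
import urllib.request


def _has_unmated_reads(run_accession: str) -> bool:
    """Guess whether a paired-end run also has an unmated-reads FASTQ."""
    url = (
        "https://www.ebi.ac.uk/ena/portal/api/filereport"
        "?accession=" + run_accession
        + "&result=read_run&fields=fastq_ftp&format=tsv"
    )
    with urllib.request.urlopen(url) as response:
        text = response.read().decode("utf-8")

    rows = list(csv.DictReader(io.StringIO(text), delimiter="\t"))
    if not rows or not rows[0]["fastq_ftp"]:
        return False

    # fastq_ftp is a semicolon-separated list; more than two entries
    # means there is a FASTQ beyond the mated _1/_2 pair.
    return len(rows[0]["fastq_ftp"].split(";")) > 2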
Example #3
0
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the Batch's file from SRA
    and pushes it into Temporary Storage.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    # There should only be one batch per SRA job.
    if batches.count() == 1:
        files = File.objects.filter(batch=batches[0])
        # All the files will be downloaded to the same directory
        target_directory = files[0].get_temp_dir(job_dir)
        os.makedirs(target_directory, exist_ok=True)
    elif batches.count() > 1:
        message = "More than one batch found for SRA downloader job. There should only be one."
        logger.error(message, downloader_job=job_id)
        job.failure_reason = message
        success = False
    else:
        message = "No batches found."
        logger.error(message, downloader_job=job_id)
        job.failure_reason = message
        success = False

    if success:
        for file in files:
            target_file_path = file.get_temp_pre_path(job_dir)
            success = _download_file(file, job, target_file_path)

            # If a download fails stop the job and fail gracefully.
            if not success:
                break

            try:
                file.size_in_bytes = os.path.getsize(target_file_path)
                file.save()
                file.upload_raw_file(job_dir)
            except Exception:
                logger.exception("Exception caught while uploading file.",
                                 downloader_job=job.id,
                                 batch=batches[0].id,
                                 file=file.id,
                                 file_name=file.name)
                job.failure_reason = "Exception caught while uploading file."
                success = False
                break

    if success:
        logger.debug(
            "Files for batch %s downloaded and extracted successfully.",
            file.download_url,
            downloader_job=job_id)

    utils.end_job(job, batches, success)
Example #4
0
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However, each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then
    let each processor find it in the same location.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    files_to_process = []

    for assoc in file_assocs:
        original_file = assoc.original_file

        if original_file.is_archive:
            filename_species = "".join(
                original_file.source_filename.split(".")[:-2])
        else:
            # Does this ever happen?
            filename_species = "".join(
                original_file.source_filename.split(".")[:-1])

        os.makedirs(LOCAL_ROOT_DIR + "/" + filename_species, exist_ok=True)
        dl_file_path = LOCAL_ROOT_DIR + "/" + filename_species + "/" + original_file.source_filename
        job = _download_file(original_file.source_url, dl_file_path, job)

        if not job.success:
            break

        original_file.is_downloaded = True
        original_file.absolute_file_path = dl_file_path
        original_file.filename = original_file.source_filename
        original_file.is_archive = True
        original_file.has_raw = True
        original_file.calculate_size()
        original_file.calculate_sha1()
        original_file.save()
        files_to_process.append(original_file)

    if job.success:
        logger.debug("Files downloaded successfully.", downloader_job=job_id)

        create_long_and_short_processor_jobs(files_to_process)

    utils.end_downloader_job(job, job.success)
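The filename_species value above just strips the archive extensions off the source filename. A quick illustration with a made-up Ensembl-style filename, showing exactly what the "".join(...) produces:

# Illustration only; the filename is hypothetical.
source_filename = "Danio_rerio.GRCz10.gtf.gz"
filename_species = "".join(source_filename.split(".")[:-2])
print(filename_species)  # "Danio_rerioGRCz10" -- the pieces are joined without a separator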
Example #5
0
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)

    downloaded_files = []
    success = None
    for assoc in file_assocs:
        original_file = assoc.original_file

        if original_file.is_downloaded:
            logger.info("File already downloaded!",
                        original_file_id=original_file.id,
                        downloader_job=job_id)
            success = True
            continue

        sample_accession_code = original_file.samples.first().accession_code
        exp_path = LOCAL_ROOT_DIR + '/' + job.accession_code
        samp_path = exp_path + '/' + sample_accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + '/' + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.is_downloaded = True
            original_file.absolute_file_path = dl_file_path
            original_file.filename = original_file.source_filename
            original_file.is_archive = False
            original_file.calculate_size()
            original_file.calculate_sha1()
            original_file.save()

            downloaded_files.append(original_file)
        else:
            break

    if success:
        utils.create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
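_download_file itself is not shown in these examples. A minimal sketch matching the (source_url, job, file_path) call pattern used here, assuming it returns a boolean and records failure_reason on the job; the real helper is likely more involved (retries, FTP handling, and so on):

# Sketch only: stream the URL to disk and report success/failure.
import contextlib
import shutil
import urllib.request


def _download_file(download_url: str, downloader_job, target_file_path: str) -> bool:
    """Download download_url to target_file_path, noting failures on the job."""
    try:
        with contextlib.closing(urllib.request.urlopen(download_url)) as request, \
                open(target_file_path, "wb") as target_file:
            shutil.copyfileobj(request, target_file)
    except Exception as e:
        failure_reason = "Exception caught while downloading file: " + str(e)
        logger.exception(failure_reason, downloader_job=downloader_job.id)
        downloader_job.failure_reason = failure_reason
        return False

    return True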
Example #6
0
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in
    ArrayExpress. Each of these files is a separate Batch, so the file
    is unzipped and then each Batch's data is stored in Temporary
    Storage.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    if batches.count() > 0:
        files = File.objects.filter(batch__in=batches)
        target_directory = files[0].get_temp_dir(job_dir)
        os.makedirs(target_directory, exist_ok=True)
        target_file_path = files[0].get_temp_download_path(job_dir)
        download_url = files[0].download_url
    else:
        logger.error("No batches found.", downloader_job=job_id)
        success = False

    if success:
        try:
            _verify_batch_grouping(files, job)

            # The files for all of the batches in the grouping are
            # contained within the same zip file. Therefore only
            # download the one.
            _download_file(download_url, target_file_path, job)
            _extract_file(files, job)
        except Exception:
            # Exceptions are already logged and handled.
            # Just need to mark the job as failed.
            success = False

    if success:
        logger.debug("File %s downloaded and extracted successfully.",
                     download_url,
                     downloader_job=job_id)

    utils.end_job(job, batches, success)
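_verify_batch_grouping is assumed to enforce the invariant described in the comment above: every file in the grouping must come from the same zip archive. A sketch of what that check might look like; the original helper's exact behaviour isn't shown here:

# Sketch only: fail the job if the grouping spans more than one download URL.
def _verify_batch_grouping(files, job) -> None:
    download_urls = {file.download_url for file in files}
    if len(download_urls) != 1:
        failure_reason = ("A file doesn't have the same download URL as the "
                          "other files in its batch grouping.")
        logger.error(failure_reason, downloader_job=job.id)
        job.failure_reason = failure_reason
        raise ValueError(failure_reason)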
Example #7
0
def download_sra(job_id: int) -> None:
    """The main function for the SRA Downloader.

    Fairly straightforward, just downloads the file from SRA.
    """
    job = utils.start_job(job_id)
    original_files = job.original_files.all()

    original_file = original_files.first()
    sample = original_file.samples.first()
    if _has_unmated_reads(sample.accession_code):
        original_files = _replace_dotsra_with_fastq_files(
            sample, job, original_file)

    downloaded_files = []
    success = None
    for original_file in original_files:
        exp_path = LOCAL_ROOT_DIR + "/" + job.accession_code
        samp_path = exp_path + "/" + sample.accession_code
        os.makedirs(exp_path, exist_ok=True)
        os.makedirs(samp_path, exist_ok=True)
        dl_file_path = samp_path + "/" + original_file.source_filename
        success = _download_file(original_file.source_url, job, dl_file_path)

        if success:
            original_file.set_downloaded(dl_file_path)
            downloaded_files.append(original_file)
        else:
            break

    if success:
        create_processor_job_for_original_files(downloaded_files, job)

    utils.end_downloader_job(job, success)

    return success, downloaded_files
Example #8
0
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in
    ArrayExpress.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    og_files = []
    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to filename to prevent multiple jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'], source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                        + og_file['filename'],
                        downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate
        # source of truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"
            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception as e:
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)
            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = ("Found a raw .CEL file with an unsupported platform: "
                                      + original_file.absolute_file_path + " ("
                                      + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

            # However, if the filename contains '.CEL' we know
            # it's an Affymetrix Microarray
            sample_object.technology = "MICROARRAY"
            sample_object.manufacturer = "AFFYMETRIX"
            sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)

        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)
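_extract_files unpacks the downloaded Array Express zip and, judging by the loop above, returns one dict per member with 'filename' and 'absolute_path' keys. A sketch under those assumptions using the standard zipfile module; error handling in the real helper may differ:

# Sketch only: unzip into the accession directory and describe each member.
import os
import zipfile


def _extract_files(dl_file_path: str, accession_code: str, job) -> list:
    extraction_dir = LOCAL_ROOT_DIR + '/' + accession_code
    try:
        with zipfile.ZipFile(dl_file_path) as zip_ref:
            zip_ref.extractall(extraction_dir)
            names = zip_ref.namelist()
    except Exception:
        job.failure_reason = "Exception caught while extracting " + dl_file_path
        logger.exception(job.failure_reason, downloader_job=job.id)
        raise

    return [
        {
            'filename': name.split('/')[-1],
            'absolute_path': os.path.join(extraction_dir, name),
        }
        for name in names
        if not name.endswith('/')  # skip directory entries
    ]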
Example #9
0
    def test_dharma(self):

        dlj1 = DownloaderJob()
        dlj1.accession_code = 'D1'
        dlj1.worker_id = get_instance_id()
        dlj1.start_time = datetime.datetime.now()
        dlj1.save()

        dlj2 = DownloaderJob()
        dlj2.accession_code = 'D2'
        dlj2.worker_id = get_instance_id()
        dlj2.start_time = datetime.datetime.now()
        dlj2.save()

        dlj3 = DownloaderJob()
        dlj3.accession_code = 'D3'
        dlj3.worker_id = get_instance_id()
        dlj3.save()

        original_file = OriginalFile()
        original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
        original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
        original_file.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.downloader_job = dlj3
        assoc.save()

        sample = Sample()
        sample.accession_code = 'Blahblahblah'
        sample.technology = "MICROARRAY"
        sample.manufacturer = "AFFYMETRIX"
        sample.has_raw = True
        sample.platform_accession_code = "hgu133a"
        sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample, original_file=original_file)

        exited = False
        try:
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=2,
                            force_harakiri=True)
        except SystemExit as e:
            # This is supposed to happen!
            self.assertTrue(True)
            exited = True
        except Exception as e:
            # This isn't!
            self.assertTrue(False)
        self.assertTrue(exited)

        exited = False
        try:
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=15,
                            force_harakiri=True)
        except SystemExit as e:
            # This is not supposed to happen!
            self.assertTrue(False)
            exited = True
        except Exception as e:
            # This is!
            self.assertTrue(True)
        self.assertFalse(exited)
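The test above exercises a guard in utils.start_job that refuses to start (and exits the process) when the node is already running the maximum number of downloader jobs. A hypothetical sketch of such a guard, using only the names visible in the test (DownloaderJob, get_instance_id, max_downloader_jobs_per_node, force_harakiri); the real implementation almost certainly differs in detail:

# Hypothetical sketch only; field and helper names are assumptions.
import sys


def _harakiri_if_overloaded(job, max_downloader_jobs_per_node, force_harakiri):
    """Exit instead of starting when this node already has too many jobs."""
    # A real check would also exclude jobs that have already finished.
    running_jobs = DownloaderJob.objects.filter(
        worker_id=get_instance_id(),
        start_time__isnull=False,
    ).count()

    if force_harakiri and running_jobs >= max_downloader_jobs_per_node:
        # Leave the job unstarted so it can be picked up again later.
        job.start_time = None
        job.save()
        sys.exit(0)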
Example #10
0
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiment stored in
    GEO.
    """
    job = utils.start_job(job_id)
    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return

    url = original_file.source_url
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # enumerate all files inside the archive
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        job.failure_reason = e
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warn("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable() or og_file.experiment_accession_code() != accession_code
        ):
            # skip the files that we know are not processable and can't be associated with a sample
            # also skip files where we couldn't find a sample and that don't mention the current experiment
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record, let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()

                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an original file for it
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # try to see if the file should be associated with a sample
        if sample:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()
        else:
            # if not, we can associate this file with all samples in the experiment
            for sample in related_samples:
                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
    else:
        success = False
        logger.info(
            "Unable to extract any files.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
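ArchivedFile is not defined in this example; the loop above relies on it exposing get_files(), filename, file_path, get_sample(), is_processable(), and experiment_accession_code(). A hypothetical, minimal sketch of just the extraction part of that interface (the sample and processability lookups are assumed to wrap model queries and filename heuristics in the real code):

# Hypothetical sketch only; the real class handles more archive types.
import os
import tarfile


class ArchivedFile:
    """Wraps a path and yields the files contained in it (recursing into tars)."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.filename = os.path.basename(file_path)

    def get_files(self):
        if tarfile.is_tarfile(self.file_path):
            extraction_dir = self.file_path + "_extracted"
            with tarfile.open(self.file_path) as archive:
                archive.extractall(extraction_dir)
            for root, _, filenames in os.walk(extraction_dir):
                for name in filenames:
                    # Recurse so nested archives are unpacked too.
                    yield from ArchivedFile(os.path.join(root, name)).get_files()
        else:
            yield self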
Example #11
0
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However, each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then
    let each processor find it in the same location.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    long_files_to_process = []
    short_files_to_process = []

    for assoc in file_assocs:
        long_original_file = assoc.original_file

        if long_original_file.is_archive:
            filename_species = "".join(
                long_original_file.source_filename.split(".")[:-2])
        else:
            # Does this ever happen?
            filename_species = "".join(
                long_original_file.source_filename.split(".")[:-1])

        # First download the files and make the original files for the
        # long transcriptome index.
        long_dir = os.path.join(LOCAL_ROOT_DIR, filename_species + "_long")
        os.makedirs(long_dir, exist_ok=True)
        long_dl_file_path = os.path.join(long_dir,
                                         long_original_file.source_filename)
        job = _download_file(long_original_file.source_url, long_dl_file_path,
                             job)

        if not job.success:
            break

        long_original_file.is_downloaded = True
        long_original_file.absolute_file_path = long_dl_file_path
        long_original_file.filename = long_original_file.source_filename
        long_original_file.has_raw = True
        long_original_file.calculate_size()
        long_original_file.calculate_sha1()
        long_original_file.save()
        long_files_to_process.append(long_original_file)

        # Next copy the files to another directory and create the
        # original files for the short transcriptome index.
        short_dir = os.path.join(LOCAL_ROOT_DIR, filename_species + "_short")
        os.makedirs(short_dir, exist_ok=True)
        short_dl_file_path = os.path.join(short_dir,
                                          long_original_file.source_filename)
        shutil.copyfile(long_dl_file_path, short_dl_file_path)

        short_original_file = OriginalFile(
            source_filename=long_original_file.source_filename,
            source_url=long_original_file.source_url,
            is_downloaded=True,
            absolute_file_path=short_dl_file_path,
            filename=long_original_file.filename,
            has_raw=True,
        )
        short_original_file.calculate_size()
        short_original_file.calculate_sha1()
        short_original_file.save()
        short_files_to_process.append(short_original_file)

    if job.success:
        logger.debug("Files downloaded successfully.", downloader_job=job_id)

        create_long_and_short_processor_jobs(job, long_files_to_process,
                                             short_files_to_process)

    utils.end_downloader_job(job, job.success)
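create_long_and_short_processor_jobs takes the job plus the two file lists built above. A heavily hedged sketch of what it might do, assuming refinebio-style helpers and models (ProcessorJob, ProcessorJobOriginalFileAssociation, ProcessorPipeline, and send_job are assumptions about the surrounding codebase, not shown in this example):

# Hypothetical sketch only; model and queue helper names are assumptions.
def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """Queue one long-index and one short-index processor job."""
    for pipeline, files in (
        (ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG, long_files_to_process),
        (ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT, short_files_to_process),
    ):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline.value
        processor_job.save()

        for original_file in files:
            ProcessorJobOriginalFileAssociation.objects.get_or_create(
                processor_job=processor_job, original_file=original_file)

        send_job(pipeline, processor_job)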
Example #12
0
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiment stored in
    GEO.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)

    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    sample_assocs = OriginalFileSampleAssociation.objects.filter(
        original_file=original_file)
    related_samples = Sample.objects.filter(
        id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split(
        '/')[-1]

    logger.debug("Starting to download: " + url,
                 job_id=job_id,
                 accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    has_raw = True
    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            filename = og_file['filename']
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception as e:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'],
                                                    accession_code)
                else:
                    extracted_subfile = [og_file]

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = extracted_subfile[0]['absolute_path']
                actual_file.filename = extracted_subfile[0]['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                # TODO - is this worth failing a job for?
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        if '_family.xml.tgz' in dl_file_path:
            has_raw = False

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            if '.txt' in og_file['filename']:
                try:
                    gsm_id = og_file['filename'].split('-')[0]
                    sample = Sample.objects.get(accession_code=gsm_id)
                except Exception as e:
                    os.remove(og_file["absolute_path"])
                    continue

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = has_raw
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw data (rather than the non-raw data inside a RAW archive).
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                for sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]
        sample_id = filename.split('_')[0]

        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = dl_file_path
        actual_file.filename = filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        for sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
    else:
        success = False
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        utils.create_processor_jobs_for_original_files(unpacked_sample_files,
                                                       job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
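_extract_tar (and the related _extract_tgz/_extract_gz helpers) is not shown; judging by the loops above it returns a list of dicts with 'filename' and 'absolute_path' keys. A sketch of the tar case under those assumptions, using the standard tarfile module:

# Sketch only: unpack the tarball and describe every regular file in it.
import os
import tarfile


def _extract_tar(dl_file_path: str, accession_code: str) -> list:
    extraction_dir = LOCAL_ROOT_DIR + '/' + accession_code
    os.makedirs(extraction_dir, exist_ok=True)

    # Mode "r:*" lets tarfile transparently handle plain and gzipped tars.
    with tarfile.open(dl_file_path, "r:*") as tarball:
        tarball.extractall(extraction_dir)
        members = [member for member in tarball.getmembers() if member.isfile()]

    return [
        {
            'filename': member.name.split('/')[-1],
            'absolute_path': os.path.join(extraction_dir, member.name),
        }
        for member in members
    ]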
Example #13
0
def download_transcriptome(job_id: int) -> None:
    """The main function for the Transcriptome Index Downloader.

    Two files are needed for the Transcriptome Index Downloader: a
    fasta file and a gtf file. However, each pair needs to be processed
    into two different sized indices. (See the
    processors.transcriptome_index._create_index function's docstring
    for more info.) Therefore we only download each set once, then
    push it to Temporary Storage twice.
    """
    job = utils.start_job(job_id)
    batches = job.batches.all()
    success = True
    job_dir = utils.JOB_DIR_PREFIX + str(job_id)

    try:
        first_fasta_file = File.objects.get(batch=batches[0],
                                            raw_format__exact="fa.gz")
        first_gtf_file = File.objects.get(batch=batches[0],
                                          raw_format__exact="gtf.gz")
        second_fasta_file = File.objects.get(batch=batches[1],
                                             raw_format__exact="fa.gz")
        second_gtf_file = File.objects.get(batch=batches[1],
                                           raw_format__exact="gtf.gz")
        os.makedirs(first_fasta_file.get_temp_dir(job_dir), exist_ok=True)
    except Exception:
        logger.exception(
            "Failed to retrieve all expected files from database.",
            downloader_job=job.id)
        job.failure_reason = "Failed to retrieve all expected files from database."
        success = False

    if success:
        try:
            _verify_files(first_fasta_file, second_fasta_file, job)
            _verify_files(first_gtf_file, second_gtf_file, job)

            # The two Batches share the same fasta and gtf files, so
            # only download each one once
            _download_file(first_fasta_file.download_url,
                           first_fasta_file.get_temp_pre_path(job_dir), job)
            _download_file(first_gtf_file.download_url,
                           first_gtf_file.get_temp_pre_path(job_dir), job)

            # Then create symlinks so the files for the second Batch
            # can be found where they will be expected to be.
            try:
                os.symlink(first_fasta_file.get_temp_pre_path(job_dir),
                           second_fasta_file.get_temp_pre_path(job_dir))
                os.symlink(first_gtf_file.get_temp_pre_path(job_dir),
                           second_gtf_file.get_temp_pre_path(job_dir))
            except Exception:
                logger.exception("Exception caught while creating symlinks.",
                                 downloader_job=job.id)
                job.failure_reason = "Exception caught while creating symlinks."
                raise

            _upload_files(job_dir, [
                first_fasta_file, first_gtf_file, second_fasta_file,
                second_gtf_file
            ], job)
        except Exception:
            # Exceptions are already logged and handled.
            # Just need to mark the job as failed.
            success = False

    if success:
        logger.debug("Files %s and %s downloaded successfully.",
                     first_fasta_file,
                     first_gtf_file,
                     downloader_job=job_id)

    utils.end_job(job, batches, success)
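_verify_files presumably enforces what the comment above relies on: the two Batches really do share one fasta file and one gtf file. A sketch of such a check; the original helper's behaviour may differ:

# Sketch only: fail the job if the two File records point at different sources.
def _verify_files(first_file, second_file, job) -> None:
    if first_file.download_url != second_file.download_url:
        failure_reason = ("Files that were expected to be identical have "
                          "different download URLs.")
        logger.error(failure_reason, downloader_job=job.id)
        job.failure_reason = failure_reason
        raise ValueError(failure_reason)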