Beispiel #1
0
    def _generate_files(self, species: Dict) -> None:
        url_builder = ensembl_url_builder_factory(species)
        fasta_download_url = url_builder.build_transcriptome_url()
        gtf_download_url = url_builder.build_gtf_url()

        # Getting the object will ensure it is created in the DB.
        Organism.get_or_create_object_for_id(url_builder.taxonomy_id)

        all_new_files = []

        fasta_filename = url_builder.filename_species + ".fa.gz"
        original_file = OriginalFile()
        original_file.source_filename = fasta_filename
        original_file.source_url = fasta_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        gtf_filename = url_builder.filename_species + ".gtf.gz"
        original_file = OriginalFile()
        original_file.source_filename = gtf_filename
        original_file.source_url = gtf_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        return all_new_files
Beispiel #2
0
    def _generate_files(self, species: Dict) -> None:
        url_builder = ensembl_url_builder_factory(species)
        fasta_download_url = url_builder.build_transcriptome_url()
        gtf_download_url = url_builder.build_gtf_url()
        
        platform_accession_code = species.pop("division")
        self._clean_metadata(species)

        all_new_files = []

        fasta_filename = url_builder.filename_species + ".fa.gz"
        original_file = OriginalFile()
        original_file.source_filename = fasta_filename
        original_file.source_url = fasta_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        gtf_filename = url_builder.filename_species + ".gtf.gz"
        original_file = OriginalFile()
        original_file.source_filename = gtf_filename
        original_file.source_url = gtf_download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

        return all_new_files
Beispiel #3
0
    def test_download_file(self):
        dlj = DownloaderJob()
        dlj.accession_code = "ERR036"
        dlj.save()

        og = OriginalFile()
        og.source_filename = "ERR036000.fastq.gz"
        og.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        og.is_archive = True
        og.save()

        sample = Sample()
        sample.accession_code = "ERR036000"
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og
        assoc.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og
        assoc.save()

        result, downloaded_files = sra.download_sra(dlj.pk)

        self.assertTrue(result)
        self.assertEqual(downloaded_files[0].sha1,
                         "1dfe5460a4101fe87feeffec0cb2e053f6695961")
        self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
Beispiel #4
0
 def test_download_file_ncbi(self):
     dlj = DownloaderJob()
     dlj.accession_code = "SRR9117853"
     dlj.save()
     og = OriginalFile()
     og.source_filename = "SRR9117853.sra"
     og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
     og.is_archive = True
     og.save()
     sample = Sample()
     sample.accession_code = "SRR9117853"
     sample.save()
     assoc = OriginalFileSampleAssociation()
     assoc.sample = sample
     assoc.original_file = og
     assoc.save()
     assoc = DownloaderJobOriginalFileAssociation()
     assoc.downloader_job = dlj
     assoc.original_file = og
     assoc.save()
     result, downloaded_files = sra.download_sra(dlj.pk)
     utils.end_downloader_job(dlj, result)
     self.assertTrue(result)
     self.assertEqual(downloaded_files[0].sha1, "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
     self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
Beispiel #5
0
    def test_download_file_swapper(self, mock_send_job):
        mock_send_job.return_value = None

        dlj = DownloaderJob()
        dlj.accession_code = "DRR002116"
        dlj.save()
        og = OriginalFile()
        og.source_filename = "DRR002116.sra"
        og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
        og.is_archive = True
        og.save()
        sample = Sample()
        sample.accession_code = 'DRR002116'
        sample.save()
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og
        assoc.save()
        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og
        assoc.save()
        result = sra._download_file(og.source_url,
                                    dlj,
                                    "/tmp",
                                    force_ftp=False)
        self.assertTrue(result)
Beispiel #6
0
 def test_download_file_ncbi(self, mock_send_job):
     mock_send_job.return_value = None
     
     dlj = DownloaderJob()
     dlj.accession_code = "DRR002116"
     dlj.save()
     og = OriginalFile()
     og.source_filename = "DRR002116.sra"
     og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
     og.is_archive = True
     og.save()
     sample = Sample()
     sample.accession_code = 'DRR002116'
     sample.save()
     assoc = OriginalFileSampleAssociation()
     assoc.sample = sample
     assoc.original_file = og
     assoc.save()
     assoc = DownloaderJobOriginalFileAssociation()
     assoc.downloader_job = dlj
     assoc.original_file = og
     assoc.save()
     result, downloaded_files = sra.download_sra(dlj.pk)
     utils.end_downloader_job(dlj, result)
     self.assertTrue(result)
     self.assertEqual(downloaded_files[0].sha1, 'd5374e7fe047d4f76b165c3f5148ab2df9d42cea')
     self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
Beispiel #7
0
    def test_download_file(self, mock_send_job):
        mock_send_job.return_value = None
        
        dlj = DownloaderJob()
        dlj.accession_code = "ERR036"
        dlj.save()

        og = OriginalFile()
        og.source_filename = "ERR036000.fastq.gz"
        og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        og.is_archive = True
        og.save()

        sample = Sample()
        sample.accession_code = 'ERR036000'
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og
        assoc.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og
        assoc.save()

        success = sra.download_sra(dlj.pk)
Beispiel #8
0
def _get_actual_file_if_queueable(
        extracted_subfile: Dict,
        original_file: OriginalFile,
        samples: List[Sample]) -> OriginalFile:
    """Returns the actual file from the archive if it should be queued.

    If the file has been processed or has an unstarted DownloaderJob,
    None will be returned.

    `extracted_subfile` should be a Dict containing metadata about the
    file that was extracted from an archive.

    `original_file` should be the file associated with the CURRENT
    DownloaderJob.

    `samples` are the samples that the actual file should be associated
    with if it has to be created.
    """
    # Check to see if we've made this original file before:
    potential_existing_files = OriginalFile.objects.filter(
        source_filename=original_file.source_filename,
        filename=extracted_subfile['filename'],
        is_archive=False
    )
    if potential_existing_files.count() > 0:
        # We've already created this record, let's see if we actually
        # needed to download it or if we just got it because we needed
        # a file in the same archive.
        actual_file = potential_existing_files[0]

        if actual_file.needs_processing():
            if not actual_file.is_downloaded:
                actual_file.is_downloaded = True
                actual_file.save()
            return actual_file
        else:
            return None

    else:
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = extracted_subfile['absolute_path']
        actual_file.filename = extracted_subfile['filename']
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        for sample in samples:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()

        return actual_file
Beispiel #9
0
    def test_download_file_unmated_reads(self):
        dlj = DownloaderJob()
        dlj.accession_code = "SRR1603661"
        dlj.save()
        og_1 = OriginalFile()
        og_1.source_filename = "SRR1603661_1.fastq.gz"
        og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
        og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
        og_1.expected_size_in_bytes = 6751980628
        og_1.is_archive = True
        og_1.save()
        og_2 = OriginalFile()
        og_2.source_filename = "SRR1603661_2.fastq.gz"
        og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
        og_1.expected_md5 = "fffd24457418d255991f54ec82a39d57"
        og_1.expected_size_in_bytes = 6949912932
        og_2.is_archive = True
        og_2.save()
        sample = Sample()
        sample.accession_code = "SRR1603661"
        sample.save()
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og_1
        assoc.save()
        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og_1
        assoc.save()
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og_2
        assoc.save()
        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og_2
        assoc.save()
        result, downloaded_files = sra.download_sra(dlj.pk)
        utils.end_downloader_job(dlj, result)

        self.assertTrue(result)
        self.assertEqual(downloaded_files[0].sha1,
                         "52bf22472069d04fa7767429f6ab78ebd10c0152")
        self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
    def test_download_file(self, mock_send_job):
        mock_send_job.return_value = None
        dlj = DownloaderJob()
        dlj.save()
        og = OriginalFile()
        og.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz"
        og.source_url = self.gtf_download_url
        og.is_archive = True
        og.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og
        assoc.save()

        transcriptome_index.download_transcriptome(dlj.pk)
Beispiel #11
0
 def test_download_file_swapper(self):
     dlj = DownloaderJob()
     dlj.accession_code = "SRR9117853"
     dlj.save()
     og = OriginalFile()
     og.source_filename = "SRR9117853.sra"
     og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
     og.is_archive = True
     og.save()
     sample = Sample()
     sample.accession_code = "SRR9117853"
     sample.save()
     assoc = OriginalFileSampleAssociation()
     assoc.sample = sample
     assoc.original_file = og
     assoc.save()
     assoc = DownloaderJobOriginalFileAssociation()
     assoc.downloader_job = dlj
     assoc.original_file = og
     assoc.save()
     result = sra._download_file(og.source_url, dlj, "/tmp/doomed", force_ftp=False)
     self.assertTrue(result)
    def test_organism_shepherd_command(self, mock_nomad, mock_send_job,
                                       mock_get_active_volumes):
        """Tests that the organism shepherd requeues jobs in the right order.

        The situation we're setting up is basically this:
          * There are two experiments.
          * One of them has 1/2 samples processed, the other 0/1
          * One of them needs a DownloaderJob requeued and the other
            needs a ProcessorJob requued.

        And what we're going to test for is:
          * Both of the jobs that need to be requeued are requeued.
          * The experiment with a processed sample is requeued first
            because it has a higher completion percentage.
        """
        # First, set up our mocks to prevent network calls.
        mock_send_job.return_value = True
        active_volumes = {"1", "2", "3"}
        mock_get_active_volumes.return_value = active_volumes

        def mock_init_nomad(host, port=0, timeout=0):
            ret_value = MagicMock()
            ret_value.jobs = MagicMock()
            ret_value.jobs.get_jobs = MagicMock()
            ret_value.jobs.get_jobs.side_effect = lambda: []
            return ret_value

        mock_nomad.side_effect = mock_init_nomad
        zebrafish = Organism(name="DANIO_RERIO",
                             taxonomy_id=1337,
                             is_scientific_name=True)
        zebrafish.save()

        # Experiment that is 0% complete.
        zero_percent_experiment = Experiment(accession_code='ERP037000')
        zero_percent_experiment.technology = 'RNA-SEQ'
        zero_percent_experiment.save()

        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=zero_percent_experiment)

        zero_percent = OriginalFile()
        zero_percent.filename = "ERR037001.fastq.gz"
        zero_percent.source_filename = "ERR037001.fastq.gz"
        zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
        zero_percent.is_archive = True
        zero_percent.save()

        zero_percent_sample = Sample()
        zero_percent_sample.accession_code = 'ERR037001'
        zero_percent_sample.organism = zebrafish
        zero_percent_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.original_file = zero_percent
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.experiment = zero_percent_experiment
        assoc.save()

        # TODO: fix names of all the variables to be appropriate for this test case.
        zero_percent_dl_job = DownloaderJob()
        zero_percent_dl_job.accession_code = zero_percent_sample.accession_code
        zero_percent_dl_job.downloader_task = "SRA"
        zero_percent_dl_job.start_time = timezone.now()
        zero_percent_dl_job.end_time = timezone.now()
        zero_percent_dl_job.success = False
        zero_percent_dl_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = zero_percent_dl_job
        assoc.original_file = zero_percent
        assoc.save()

        # Experiment that is 50% complete.
        fify_percent_experiment = Experiment(accession_code='ERP036000')
        fify_percent_experiment.technology = 'RNA-SEQ'
        fify_percent_experiment.save()

        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=fify_percent_experiment)

        ## First sample, this one has been processed.
        successful_pj = ProcessorJob()
        successful_pj.accession_code = "ERR036000"
        successful_pj.pipeline_applied = "SALMON"
        successful_pj.ram_amount = 12288
        successful_pj.start_time = timezone.now()
        successful_pj.end_time = timezone.now()
        successful_pj.success = True
        successful_pj.save()

        successful_og = OriginalFile()
        successful_og.filename = "ERR036000.fastq.gz"
        successful_og.source_filename = "ERR036000.fastq.gz"
        successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        successful_og.is_archive = True
        successful_og.save()

        successful_sample = Sample()
        successful_sample.accession_code = 'ERR036000'
        successful_sample.organism = zebrafish
        successful_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = successful_sample
        assoc.original_file = successful_og
        assoc.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = successful_pj
        assoc.original_file = successful_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = successful_sample
        assoc.experiment = fify_percent_experiment
        assoc.save()

        ## Second sample, this one hasn't been processed.
        fifty_percent_unprocessed_og = OriginalFile()
        fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
        fifty_percent_unprocessed_og.is_archive = True
        fifty_percent_unprocessed_og.save()

        fifty_percent_unprocessed_sample = Sample()
        fifty_percent_unprocessed_sample.accession_code = 'ERR036001'
        fifty_percent_unprocessed_sample.organism = zebrafish
        fifty_percent_unprocessed_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.experiment = fify_percent_experiment
        assoc.save()

        fifty_percent_processor_job = ProcessorJob()
        fifty_percent_processor_job.pipeline_applied = "SALMON"
        fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code
        fifty_percent_processor_job.ram_amount = 12288
        fifty_percent_processor_job.start_time = timezone.now()
        fifty_percent_processor_job.end_time = timezone.now()
        fifty_percent_processor_job.success = False
        fifty_percent_processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = fifty_percent_processor_job
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        # Setup is done, actually run the command.
        args = []
        options = {"organism_name": "DANIO_RERIO"}
        call_command("organism_shepherd", *args, **options)

        # Verify that the jobs were called in the correct order.
        mock_calls = mock_send_job.mock_calls

        first_call_job_type = mock_calls[0][1][0]
        first_call_job_object = mock_calls[0][2]["job"]
        self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON)
        self.assertEqual(first_call_job_object.pipeline_applied,
                         fifty_percent_processor_job.pipeline_applied)
        self.assertEqual(first_call_job_object.ram_amount,
                         fifty_percent_processor_job.ram_amount)
        self.assertIn(first_call_job_object.volume_index, active_volumes)

        fifty_percent_processor_job.refresh_from_db()
        self.assertEqual(first_call_job_object,
                         fifty_percent_processor_job.retried_job)

        second_call_job_type = mock_calls[1][1][0]
        second_call_job_object = mock_calls[1][2]["job"]
        self.assertEqual(second_call_job_type, Downloaders.SRA)
        self.assertEqual(second_call_job_object.accession_code,
                         zero_percent_dl_job.accession_code)
        self.assertEqual(second_call_job_object.downloader_task,
                         zero_percent_dl_job.downloader_task)

        zero_percent_dl_job.refresh_from_db()
        self.assertEqual(second_call_job_object,
                         zero_percent_dl_job.retried_job)
Beispiel #13
0
    def create_samples_from_api(self, experiment: Experiment,
                                platform_dict: Dict) -> List[Sample]:
        """Generates a Sample item for each sample in an AE experiment.

        There are many possible data situations for a sample:

            - If the sample only has raw data available:
                - If it is on a platform that we support:
                    Download this raw data and process it
                - If it is not on a platform we support:
                    Don't download anything, don't process anything
            - If the sample has both raw and derived data:
                - If the raw data is on a platform we support:
                    Download the raw data and process it, abandon the derived data
                - If the raw data is not on a platform we support
                    Download the derived data and no-op it, abandon the raw data
            - If the sample only has derived data:
                Download the derived data and no-op it.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

        """

        created_samples = []

        samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # The SDRF is the complete metadata record on a sample/property basis.
        # We run this through our harmonizer and then attach the properties
        # to our created samples.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
        sdrf_samples = harmony.parse_sdrf(sdrf_url)
        harmonized_samples = harmony.harmonize(sdrf_samples)

        # An experiment can have many samples
        for sample_data in samples:

            # For some reason, this sample has no files associated with it.
            if "file" not in sample_data or len(sample_data['file']) == 0:
                continue

            # Each sample is given an experimenatlly-unique title.
            flat_sample = utils.flatten(sample_data)
            title = harmony.extract_title(flat_sample)

            # A sample may actually have many sub files.
            # If there is raw data, take that.
            # If not, take the derived.
            has_raw = False
            for sub_file in sample_data['file']:

                # For ex: E-GEOD-15645
                if isinstance(sub_file['comment'], list):
                    sub_file_mod = sub_file
                    sub_file_mod['comment'] = sub_file['comment'][0]
                else:
                    sub_file_mod = sub_file

                # Some have the 'data' field, but not the actual data
                # Ex: E-GEOD-9656
                if sub_file_mod['type'] == "data" and sub_file_mod[
                        'comment'].get('value', None) != None:
                    has_raw = True
                if 'raw' in sub_file_mod['comment'].get('value', ''):
                    has_raw = True

            skip_sample = False
            for sub_file in sample_data['file']:

                # Don't get the raw data if it's only a 1-color sample.
                if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                    has_raw = False

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file['type']:
                    continue

                download_url = None
                filename = sub_file["name"]

                # sub_file["comment"] is only a list if there's
                # more than one comment...
                comments = sub_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it so we need to
                    # mark this job as an error. Therefore don't catch
                    # the potential exception where download_url
                    # doesn't get defined.
                    for comment in comments:
                        if "FTP file" in comment["name"]:
                            download_url = comment["value"]
                            break
                else:
                    download_url = comments["value"]

                if not download_url:
                    logger.error(
                        "Sample %s did not specify a download url, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

                if not filename:
                    logger.error(
                        "Sample %s did not specify a filename, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

            if skip_sample:
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample_data["source"].get("name", "")
            sample_assay_name = sample_data["assay"].get("name", "")
            sample_accession_code = self.determine_sample_accession(
                experiment.accession_code, sample_source_name,
                sample_assay_name, filename)

            # Figure out the Organism for this sample
            organism_name = UNKNOWN
            for characteristic in sample_data["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == UNKNOWN:
                logger.error(
                    "Sample %s did not specify the organism name.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
                organism = None
                continue
            else:
                organism = Organism.get_object_for_name(organism_name)

            # Create the sample object
            try:
                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)

                # If input experiment includes new protocol information,
                # update sample's protocol_info.
                existing_protocols = sample_object.protocol_info
                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols, experiment.protocol_description,
                    experiment.source_url + '/protocols')
                if is_updated:
                    sample_object.protocol_info = protocol_info
                    sample_obejct.save()

                logger.debug(
                    "Sample %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
            except Sample.DoesNotExist:
                sample_object = Sample()

                # The basics
                sample_object.source_database = "ARRAY_EXPRESS"
                sample_object.title = title
                sample_object.accession_code = sample_accession_code
                sample_object.source_archive_url = samples_endpoint
                sample_object.organism = organism
                sample_object.platform_name = platform_dict[
                    "platform_accession_name"]
                sample_object.platform_accession_code = platform_dict[
                    "platform_accession_code"]
                sample_object.manufacturer = platform_dict["manufacturer"]
                sample_object.technology = "MICROARRAY"

                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols=[],
                    experiment_protocol=experiment.protocol_description,
                    protocol_url=experiment.source_url + '/protocols')
                # Do not check is_updated the first time because we must
                # save a list so we can append to it later.
                sample_object.protocol_info = protocol_info

                sample_object.save()

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)
                sample_object.save()

                sample_annotation = SampleAnnotation()
                sample_annotation.data = sample_data
                sample_annotation.sample = sample_object
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                original_file = OriginalFile()
                original_file.filename = filename
                original_file.source_filename = filename
                original_file.source_url = download_url
                original_file.is_downloaded = False
                original_file.is_archive = True
                original_file.has_raw = has_raw
                original_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.original_file = original_file
                original_file_sample_association.sample = sample_object
                original_file_sample_association.save()

                created_samples.append(sample_object)

                logger.debug(
                    "Created " + str(sample_object),
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sample=sample_object.id)

            # Create associations if they don't already exist
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment, organism=organism)

        return created_samples
Beispiel #14
0
    def test_create_missing_jobs(self):
        """Tests that files which should have processor jobs get them created.

        Specifically files that fall into this category are files that
        had successful downloader jobs but for some reason do not have
        processor jobs. It's not yet known why this is happening, but
        part of this management command is logging about them to get a
        grasp of how many there are.

        We want this test to cover both Microarray and RNA-Seq. We
        also need to test both that files which need processor jobs
        have them created, but also that files which don't need them
        don't get them created.

        Therefore we need at least 4 original files:
          * Microarray needing processor job.
          * Microarray not needing processor job.
          * RNA-Seq needing processor job.
          * RNA-Seq not needing processor job.

        However Microarray can have files which shouldn't get
        processor jobs, so we're going to make one of those as
        well. Also Microarray jobs can download multiple files which
        get a processor job each, so we're going to make an additional
        Microarray file and associate it with the same downloader job
        so we can make sure two processor jobs are created based on
        that one downloader job.
        """
        # Microarray File/Samples/Jobs
        ma_og_doesnt_need_processor = OriginalFile()
        ma_og_doesnt_need_processor.filename = "processed.CEL"
        ma_og_doesnt_need_processor.is_downloaded = True
        ma_og_doesnt_need_processor.is_archive = False
        ma_og_doesnt_need_processor.save()

        ma_sample_doesnt_need_processor = Sample()
        ma_sample_doesnt_need_processor.accession_code = "MA_doesnt_need_processor"
        ma_sample_doesnt_need_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_doesnt_need_processor,
            original_file=ma_og_doesnt_need_processor)

        ma_dl_job_doesnt_need_processor = DownloaderJob()
        ma_dl_job_doesnt_need_processor.success = True
        ma_dl_job_doesnt_need_processor.worker_id = "worker_1"
        ma_dl_job_doesnt_need_processor.volume_index = "1"
        ma_dl_job_doesnt_need_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_doesnt_need_processor,
            original_file=ma_og_doesnt_need_processor)

        ma_processor_job = ProcessorJob()
        ma_processor_job.success = True
        ma_processor_job.worker_id = "worker_1"
        ma_dl_job_doesnt_need_processor.volume_index = "1"
        ma_processor_job.save()

        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=ma_processor_job,
            original_file=ma_og_doesnt_need_processor)

        ma_og_needs_processor_1 = OriginalFile()
        ma_og_needs_processor_1.filename = "something.CEL"
        ma_og_needs_processor_1.is_downloaded = True
        ma_og_needs_processor_1.is_archive = False
        ma_og_needs_processor_1.save()

        ma_og_needs_processor_2 = OriginalFile()
        ma_og_needs_processor_2.filename = "something_else.CEL"
        ma_og_needs_processor_2.is_downloaded = True
        ma_og_needs_processor_2.is_archive = False
        ma_og_needs_processor_2.save()

        ma_og_archive = OriginalFile()
        ma_og_archive.filename = "tarball.gz"
        ma_og_archive.is_downloaded = True
        ma_og_archive.is_archive = True
        ma_og_archive.save()

        ma_sample_needs_processor_1 = Sample()
        ma_sample_needs_processor_1.accession_code = "MA_needs_processor_1"
        ma_sample_needs_processor_1.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_1,
            original_file=ma_og_needs_processor_1)
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_1, original_file=ma_og_archive)

        ma_sample_needs_processor_2 = Sample()
        ma_sample_needs_processor_2.accession_code = "MA_needs_processor_2"
        ma_sample_needs_processor_2.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_2,
            original_file=ma_og_needs_processor_2)
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=ma_sample_needs_processor_2, original_file=ma_og_archive)

        ma_dl_job_needs_processor = DownloaderJob()
        ma_dl_job_needs_processor.success = True
        ma_dl_job_needs_processor.worker_id = "worker_1"
        ma_dl_job_doesnt_need_processor.volume_index = "1"
        ma_dl_job_needs_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_needs_processor_1)
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_needs_processor_2)
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=ma_dl_job_needs_processor,
            original_file=ma_og_archive)

        # RNA-Seq File/Samples/Jobs
        rna_og_doesnt_need_processor = OriginalFile()
        rna_og_doesnt_need_processor.filename = "processed.fastq"
        rna_og_doesnt_need_processor.is_downloaded = True
        rna_og_doesnt_need_processor.is_archive = False
        rna_og_doesnt_need_processor.save()

        rna_sample_doesnt_need_processor = Sample()
        rna_sample_doesnt_need_processor.accession_code = "RNA_doesnt_need_processor"
        rna_sample_doesnt_need_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=rna_sample_doesnt_need_processor,
            original_file=rna_og_doesnt_need_processor)

        rna_dl_job_doesnt_need_processor = DownloaderJob()
        rna_dl_job_doesnt_need_processor.success = True
        rna_dl_job_doesnt_need_processor.worker_id = "worker_1"
        rna_dl_job_doesnt_need_processor.volume_index = "1"
        rna_dl_job_doesnt_need_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=rna_dl_job_doesnt_need_processor,
            original_file=rna_og_doesnt_need_processor)

        rna_processor_job = ProcessorJob()
        # Failed ProcessorJobs will be retried, so they still count.
        rna_processor_job.success = False
        rna_processor_job.worker_id = "worker_1"
        rna_dl_job_doesnt_need_processor.volume_index = "1"
        rna_processor_job.save()

        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=rna_processor_job,
            original_file=rna_og_doesnt_need_processor)

        rna_og_needs_processor = OriginalFile()
        rna_og_needs_processor.filename = "something.fastq"
        rna_og_needs_processor.is_downloaded = True
        rna_og_needs_processor.is_archive = False
        rna_og_needs_processor.save()

        rna_sample_needs_processor = Sample()
        rna_sample_needs_processor.accession_code = "RNA_needs_processor"
        rna_sample_needs_processor.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=rna_sample_needs_processor,
            original_file=rna_og_needs_processor)

        rna_dl_job_needs_processor = DownloaderJob()
        rna_dl_job_needs_processor.success = True
        rna_dl_job_needs_processor.worker_id = "worker_1"
        rna_dl_job_doesnt_need_processor.volume_index = "1"
        rna_dl_job_needs_processor.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=rna_dl_job_needs_processor,
            original_file=rna_og_needs_processor)

        # Setup is done, actually run the command.
        command = Command()
        command.handle()

        # Test Microarray was handled correctly.
        ## Test that a missing processor job was created.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_needs_processor_1).count())
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_needs_processor_2).count())
        self.assertEqual(
            0,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_archive).count())

        ## Test that a processor job that wasn't missing wasn't created.
        ## Of course, we created one in test setup, so we're really
        ## checking that it's still only 1.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=ma_og_doesnt_need_processor).count())

        # Test Microarray was handled correctly.
        ## Test that the missing processor job was created.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=rna_og_needs_processor).count())

        ## Test that a processor job that wasn't missing wasn't created.
        ## Of course, we created one in test setup, so we're really
        ## checking that it's still only 1.
        self.assertEqual(
            1,
            ProcessorJobOriginalFileAssociation.objects.filter(
                original_file=rna_og_doesnt_need_processor).count())
Beispiel #15
0
    def test_create_missing_jobs(self):
        """Tests that files which should have downloader jobs get them created."""

        # 1. create a sample with an original file and a downloader job
        original_file_with_downloader = OriginalFile()
        original_file_with_downloader.filename = "processed.CEL"
        original_file_with_downloader.source_filename = "processed.CEL"
        original_file_with_downloader.is_downloaded = True
        original_file_with_downloader.is_archive = False
        original_file_with_downloader.save()

        sample_with_downloader = Sample()
        sample_with_downloader.accession_code = "MA_doesnt_need_processor"
        sample_with_downloader.technology = "MICROARRAY"
        sample_with_downloader.source_database = "GEO"
        sample_with_downloader.platform_accession_code = "bovine"
        sample_with_downloader.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_with_downloader,
            original_file=original_file_with_downloader)

        downloader_job = DownloaderJob()
        downloader_job.success = True
        downloader_job.worker_id = "worker_1"
        downloader_job.volume_index = "1"
        downloader_job.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=downloader_job,
            original_file=original_file_with_downloader)

        # 2. create a sample with an original file and no downloader job
        original_file = OriginalFile()
        original_file.filename = "tarball.gz"
        original_file.source_filename = "tarball.gz"
        original_file.is_downloaded = True
        original_file.is_archive = True
        original_file.save()

        sample_no_downloader = Sample()
        sample_no_downloader.accession_code = "sample_no_downloader"
        sample_no_downloader.technology = "MICROARRAY"
        sample_no_downloader.source_database = "GEO"
        sample_no_downloader.platform_accession_code = "bovine"  # must be a supported platform
        sample_no_downloader.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_no_downloader, original_file=original_file)

        # 3. Setup is done, actually run the command.
        command = Command()
        command.handle()

        ## Test that a missing downloader job was created.
        self.assertEqual(
            1,
            DownloaderJobOriginalFileAssociation.objects.filter(
                original_file=original_file).count(),
        )

        ## Test that a downloader job that wasn't missing wasn't created.
        ## Of course, we created one in test setup, so we're really
        ## checking that it's still only 1.
        self.assertEqual(
            1,
            DownloaderJobOriginalFileAssociation.objects.filter(
                original_file=original_file_with_downloader).count(),
        )
Beispiel #16
0
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiment stored in
    GEO.
    """
    job = utils.start_job(job_id)
    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return

    url = original_file.source_url
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # enumerate all files inside the archive
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        job.failure_reason = e
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warn("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable() or og_file.experiment_accession_code() != accession_code
        ):
            # skip the files that we know are not processable and can't be associated with a sample
            # also skip the files were we couldn't find a sample and they don't mention the current experiment
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record, let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()

                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an original file for it
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # try to see if the file should be associated with a sample
        if sample:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()
        else:
            # if not, we can associate this file with all samples in the experiment
            for sample in related_samples:
                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
    else:
        success = False
        logger.info(
            "Unable to extract any files.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
Beispiel #17
0
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing
    samples relating to a single experiement stored in
    GEO.
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)

    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    sample_assocs = OriginalFileSampleAssociation.objects.filter(
        original_file=original_file)
    related_samples = Sample.objects.filter(
        id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split(
        '/')[-1]

    logger.debug("Starting to download: " + url,
                 job_id=job_id,
                 accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    has_raw = True
    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            filename = og_file['filename']
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception as e:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'],
                                                    accession_code)
                else:
                    extracted_subfile = [og_file]

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = extracted_subfile[0][
                    'absolute_path']
                actual_file.filename = extracted_subfile[0]['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                # TODO - is this worth failing a job for?
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        if '_family.xml.tgz' in dl_file_path:
            has_raw = False

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            if '.txt' in og_file['filename']:
                try:
                    gsm_id = og_file['filename'].split('-')[0]
                    sample = Sample.objects.get(accession_code=gsm_id)
                except Exception as e:
                    os.remove(og_file["absolute_path"])
                    continue

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = has_raw
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:

            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                for sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]
        sample_id = filename.split('_')[0]

        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = dl_file_path
        actual_file.filename = filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        for sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
    else:
        success = False
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        utils.create_processor_jobs_for_original_files(unpacked_sample_files,
                                                       job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success