Example #1
0
    def setUpClass(cls):
        """Insert the survey job, batch, and file the file tests rely on."""
        job = SurveyJob(source_type="ARRAY_EXPRESS")
        job.save()

        test_batch = Batch(
            survey_job=job,
            source_type="ARRAY_EXPRESS",
            pipeline_required="AFFY_TO_PCL",
            platform_accession_code="A-AFFY-141",
            experiment_accession_code="E-GEOD-59071",
            experiment_title="It doesn't really matter.",
            organism_id=9606,
            organism_name="H**O SAPIENS",
            release_date="2017-05-05",
            last_uploaded_date="2017-05-05",
            status=BatchStatuses.NEW.value,
        )
        test_batch.save()

        File(
            size_in_bytes=0,
            download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL",
            batch=test_batch,
        ).save()

        super(FilesTestCase, cls).setUpClass()
def init_objects():
    """Create and save a DOWNLOADED ARRAY_EXPRESS batch with one CEL file.

    Returns the batch with its in-memory ``files`` list populated.
    """
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.DOWNLOADED.value,
    )
    new_batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=new_batch,
    )
    cel_file.save()

    new_batch.files = [cel_file]
    return new_batch
Example #3
0
    def handle(self, *args, **options):
        """Create dummy survey data, then queue an ARRAY_EXPRESS downloader job.

        Mimics the database state a survey run would have produced before
        any downloader job could be generated.
        """
        job = SurveyJob(source_type="ARRAY_EXPRESS")
        job.save()

        dummy_batch = Batch(
            survey_job=job,
            source_type="ARRAY_EXPRESS",
            pipeline_required="AFFY_TO_PCL",
            platform_accession_code="A-AFFY-141",
            experiment_accession_code="E-GEOD-59071",
            experiment_title="It doesn't really matter.",
            organism_id=9606,
            organism_name="H**O SAPIENS",
            release_date="2017-05-05",
            last_uploaded_date="2017-05-05",
            status=BatchStatuses.NEW.value,
        )
        dummy_batch.save()

        File(
            batch=dummy_batch,
            size_in_bytes=0,
            download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072_CD_colon_active_2.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL",
        ).save()

        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[dummy_batch])
        send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
Example #4
0
def _insert_salmon_index():
    """Creates a batch for the index for the organism for the test."""
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()

    index_batch = Batch(
        survey_job=job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="TEST",
        experiment_accession_code="HOMO_SAPIENS",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.PROCESSED.value,
    )
    index_batch.save()

    # The index was built with a kmer size of 23 (the "short" index).
    BatchKeyValue(key="kmer_size", value="23", batch=index_batch).save()

    File(
        size_in_bytes=2214725074,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="Homo_sapiens_short.gtf.gz",
        internal_location="TEST/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens"
                      "/Homo_sapiens.GRCh38.90.gtf.gz"),
        batch=index_batch,
    ).save()
    def test_good_file_grouping(self):
        """_verify_files returns None when both download_urls match."""
        job = DownloaderJob.create_job_and_relationships(
            batches=[], downloader_task="dummy")

        result = transcriptome_index._verify_files(
            File(download_url="a"), File(download_url="a"), job)
        self.assertIsNone(result)
    def test_bad_file_grouping(self):
        """Raises ValueError if both files don't have the same download_url."""
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[], downloader_task="dummy")

        # The assertIsNone() that used to wrap this call was dead code:
        # _verify_files raises before returning, so only the assertRaises
        # check is ever exercised.
        with self.assertRaises(ValueError):
            transcriptome_index._verify_files(File(download_url="a"),
                                              File(download_url="b"),
                                              downloader_job)
Example #7
0
    def test_survey(self):
        """Batches whose files share a download URL are grouped together."""
        surveyor = ArrayExpressSurveyor(
            SurveyJob(source_type="ARRAY_EXPRESS"))

        batch_a = Batch(files=[File(download_url="a")])
        batch_b = Batch(files=[File(download_url="a")])
        batch_c = Batch(files=[File(download_url="b"),
                               File(download_url="a")])

        surveyor.batches = [batch_a, batch_b, batch_c]
        self.assertEqual(surveyor.group_batches(),
                         [[batch_a, batch_b], [batch_c]])
Example #8
0
def get_batch():
    """Save and return a NEW ARRAY_EXPRESS batch that has one CEL file."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    new_batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url="example.com",
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=new_batch,
    )
    cel_file.save()
    return new_batch
Example #9
0
    def test_aspera_downloader(self):
        """sra._download_file fetches an SRA FTP URL (converted to Aspera)."""
        sra_batch = Batch(
            survey_job=self.survey_job,
            source_type="SRA",
            pipeline_required="SALMON",
            platform_accession_code="IlluminaHiSeq2000",
            experiment_accession_code="DRX001563",
            experiment_title="It doesn't really matter.",
            organism_id=9031,
            organism_name="GALLUS GALLUS",
            release_date="2013-07-19",
            last_uploaded_date="2017-09-11",
            status=BatchStatuses.NEW.value,
        )
        sra_batch.save()

        # _download_file converts this FTP URL into an Aspera one.
        fastq = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="ERR036000_1.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=sra_batch,
        )
        job = DownloaderJob()

        self.assertTrue(sra._download_file(fastq, job, fastq.name))
Example #10
0
def _download_file(file: File,
                   downloader_job: DownloaderJob,
                   target_file_path: str,
                   force_ftp: bool = False) -> bool:
    """Download file dispatcher. Dispatches to the FTP or Aspera downloader.

    SRA-hosted files are rewritten to an Aspera URL and downloaded with
    _download_file_aspera unless force_ftp is True; everything else goes
    through _download_file_ftp.  Returns whatever the chosen downloader
    returns (True on success).
    """
    # SRA files have Aspera downloads.
    if 'ftp.sra.ebi.ac.uk' in file.download_url and not force_ftp:
        # From: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz
        # To: [email protected]:/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz
        # Rewrite only the scheme+host prefix in a single anchored
        # replacement.  The previous chain of global str.replace() calls
        # ('ftp' -> 'fasp') would also mangle any "ftp" occurring later
        # in the path or the file name.
        file.download_url = file.download_url.replace(
            'ftp://ftp.sra.ebi.ac.uk/', '[email protected]:/', 1)

        return _download_file_aspera(file, downloader_job, target_file_path)
    else:
        return _download_file_ftp(file, downloader_job, target_file_path)
Example #11
0
    def run_trasnscriptome_processor(self):
        """Queue a TRANSCRIPTOME_INDEX processor job against dummy data.

        (NOTE(review): the "trasnscriptome" typo in the name is preserved
        because external callers reference this method.)
        """
        # Create the records that a survey + download run would normally
        # have produced before a processor job exists.
        job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
        job.save()

        index_batch = Batch(
            survey_job=job,
            source_type="TRANSCRIPTOME_INDEX",
            pipeline_required="TRANSCRIPTOME_INDEX",
            platform_accession_code="EnsemblPlants",
            experiment_accession_code="aegilops_tauschii",
            experiment_title="It doesn't really matter.",
            organism_id=37682,
            organism_name="AEGILOPS TAUSCHII",
            release_date="2017-11-02",
            last_uploaded_date="2017-11-02",
            status=BatchStatuses.DOWNLOADED.value,
        )
        index_batch.save()

        BatchKeyValue(batch=index_batch, key="kmer_size", value="31").save()

        File(
            name="aegilops_tauschii_short.gtf.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
            raw_format="gtf.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=index_batch,
        ).save()

        File(
            name="aegilops_tauschii_short.fa.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                "/aegilops_tauschii/dna/Aegilops_tauschii."
                "ASM34733v1.dna.toplevel.fa.gz"),
            raw_format="fa.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=index_batch,
        ).save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[index_batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[index_batch.pipeline_required],
                 processor_job.id)
Example #12
0
    def run_sra_processor(self):
        """Queue a SALMON processor job against dummy paired-end SRA data."""
        # Create the records that a survey + download run would normally
        # have produced before a processor job exists.
        job = SurveyJob(source_type="SRA")
        job.save()

        sra_batch = Batch(
            survey_job=job,
            source_type="SRA",
            pipeline_required="SALMON",
            platform_accession_code="IlluminaHiSeq2500",
            experiment_accession_code="PRJEB5018",
            experiment_title="It doesn't really matter.",
            organism_id=10090,
            organism_name="MUS MUSCULUS",
            release_date="2014-03-25",
            last_uploaded_date="2016-05-20",
            status=BatchStatuses.NEW.value,
        )
        sra_batch.save()

        # One File per read of the pair.
        for read_number in ("1", "2"):
            File(
                name="ERR1680082_{}.fastq".format(read_number),
                download_url=(
                    "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                    "ERR1680082/ERR1680082_{}.fastq.gz".format(read_number)),
                raw_format="fastq",
                processed_format="sf",
                internal_location="IlluminaHiSeq2500/SALMON",
                size_in_bytes=2214725074,
                batch=sra_batch,
            ).save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[sra_batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[sra_batch.pipeline_required],
                 processor_job.id)
    def _generate_batch(self, species: Dict) -> None:
        """Add one long and one short transcriptome-index batch for *species*.

        Builds the fasta and gtf download URLs once, then calls
        self.add_batch twice — once per index length.  NOTE: mutates
        *species* in place (pops "division", passes it through
        self._clean_metadata, and adds "length"/"kmer_size" keys); the
        same dict object is handed to add_batch on both iterations.
        """
        url_builder = ensembl_url_builder_factory(species)
        fasta_download_url = url_builder.build_transcriptome_url()
        gtf_download_url = url_builder.build_gtf_url()

        current_time = timezone.now()
        # Remove "division" so it isn't duplicated in the batch's
        # key/value metadata; it becomes the platform accession instead.
        platform_accession_code = species.pop("division")
        self._clean_metadata(species)

        for length in ("_long", "_short"):
            fasta_file_name = url_builder.file_name_species + length + ".fa.gz"
            fasta_file = File(name=fasta_file_name,
                              download_url=fasta_download_url,
                              raw_format="fa.gz",
                              processed_format="tar.gz",
                              size_in_bytes=-1)  # Will have to be determined later

            gtf_file_name = url_builder.file_name_species + length + ".gtf.gz"
            gtf_file = File(name=gtf_file_name,
                            download_url=gtf_download_url,
                            raw_format="gtf.gz",
                            processed_format="tar.gz",
                            size_in_bytes=-1)  # Will have to be determined later

            # Add a couple extra key/value pairs to the Batch.
            # The long index uses kmer size 31, the short one 23.
            species["length"] = length
            species["kmer_size"] = "31" if length == "_long" else "23"

            self.add_batch(platform_accession_code=platform_accession_code,
                           experiment_accession_code=url_builder.file_name_species.upper(),
                           organism_id=url_builder.taxonomy_id,
                           organism_name=url_builder.scientific_name,
                           experiment_title="NA",
                           release_date=current_time,
                           last_uploaded_date=current_time,
                           files=[fasta_file, gtf_file],
                           # Store the rest of the metadata about these!
                           key_values=species)
def init_objects():
    """Create a downloaded TRANSCRIPTOME_INDEX batch with its gtf/fasta pair.

    Returns (batch, gtf_file, fasta_file); batch.files is populated in
    memory with both files.
    """
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()

    index_batch = Batch(
        survey_job=job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    index_batch.save()
    BatchKeyValue(batch=index_batch, key="length", value="_short").save()
    BatchKeyValue(batch=index_batch, key="kmer_size", value="23").save()

    gtf = File(
        size_in_bytes=-1,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.gtf.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
            "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        batch=index_batch,
    )
    gtf.save()

    fasta = File(
        size_in_bytes=-1,
        raw_format="fa.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.fa.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
            "/aegilops_tauschii/dna/Aegilops_tauschii."
            "ASM34733v1.dna.toplevel.fa.gz"),
        batch=index_batch,
    )
    fasta.save()

    index_batch.files = [gtf, fasta]
    return (index_batch, gtf, fasta)
Example #15
0
    def _build_file(run_accession: str, read_suffix="") -> File:
        """Construct a File for one ENA run (optionally one read of a pair).

        ENA nests its data oddly: run accessions longer than 9 characters
        gain an extra sub-directory of "00" + the accession's last digit.
        """
        extra_dir = ""
        if len(run_accession) > 9:
            extra_dir = ENA_SUB_DIR_PREFIX + run_accession[-1]

        url = ENA_DOWNLOAD_URL_TEMPLATE.format(
            short_accession=run_accession[:6],
            sub_dir=extra_dir,
            long_accession=run_accession,
            read_suffix=read_suffix)
        return File(name=run_accession + read_suffix + ".fastq.gz",
                    download_url=url,
                    raw_format="fastq.gz",
                    processed_format="tar.gz",
                    size_in_bytes=-1)  # Will have to be determined later
Example #16
0
def init_objects():
    """Create a downloaded SALMON batch with its paired-end fastq files.

    Returns (batch, first_fastq_file, second_fastq_file); batch.files is
    populated in memory with both files.
    """
    job = SurveyJob(source_type="SALMON")
    job.save()

    salmon_batch = Batch(
        survey_job=job,
        source_type="SALMON",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaGenomeAnalyzerII",
        experiment_accession_code="ERX000259",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    salmon_batch.save()

    # One File per read of the pair.
    fastq_files = []
    for read_number in ("1", "2"):
        fastq = File(
            size_in_bytes=2214725074,
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="ERR003000_{}.fastq.gz".format(read_number),
            internal_location="IlluminaGenomeAnalyzerII/SALMON",
            download_url=(
                "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                "ERR003000/ERR003000_{}.fastq.gz".format(read_number)),
            batch=salmon_batch,
        )
        fastq.save()
        fastq_files.append(fastq)

    salmon_batch.files = fastq_files
    return (salmon_batch, fastq_files[0], fastq_files[1])
    def insert_objects(self) -> List[Batch]:
        """Insert two TRANSCRIPTOME_INDEX batches with their File pairs.

        batch1 is the "_short"/kmer-23 index and batch2 the "_long"/kmer-31
        index.  Each batch gets a fasta and a gtf File (built from
        self.fasta_download_url / self.gtf_download_url) attached via its
        in-memory ``files`` list.  Returns [batch1, batch2].
        """
        # (An unused hard-coded download_url local was removed; the files
        # use the instance's fasta/gtf download URLs.)
        batch1 = Batch(survey_job=self.survey_job,
                       source_type="TRANSCRIPTOME_INDEX",
                       pipeline_required="TRANSCRIPTOME_INDEX",
                       platform_accession_code="EnsemblPlants",
                       experiment_accession_code="AEGILOPS_TAUSCHII",
                       experiment_title="It doesn't really matter.",
                       organism_id=37682,
                       organism_name="AEGILOPS TAUSCHII",
                       release_date="2017-05-05",
                       last_uploaded_date="2017-05-05",
                       status=BatchStatuses.NEW.value)
        batch2 = copy.deepcopy(batch1)
        batch1.save()
        batch2.save()

        for batch, length, kmer_size in [(batch1, "_short", "23"),
                                         (batch2, "_long", "31")]:
            BatchKeyValue(batch=batch, key="length", value=length).save()
            BatchKeyValue(batch=batch, key="kmer_size", value=kmer_size).save()

            file1 = File(size_in_bytes=0,
                         download_url=self.fasta_download_url,
                         raw_format="fa.gz",
                         processed_format="tar.gz",
                         name="Aegilops_tauschii{}.fa.gz".format(length),
                         internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                         batch=batch)
            file2 = File(size_in_bytes=0,
                         download_url=self.gtf_download_url,
                         raw_format="gtf.gz",
                         processed_format="tar.gz",
                         name="Aegilops_tauschii{}.gtf.gz".format(length),
                         internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                         batch=batch)
            file1.save()
            file2.save()
            batch.files = [file1, file2]

        return [batch1, batch2]
Example #18
0
    def insert_objects(self) -> "Tuple[Batch, List[File]]":
        """Insert one SRA batch with its paired-end fastq Files.

        Returns (batch, [file, file2]).  The previous annotation claimed
        List[Batch], which did not match the returned tuple; an unused
        download_url local has also been removed.
        """
        batch = Batch(survey_job=self.survey_job,
                      source_type="SRA",
                      pipeline_required="SALMON",
                      platform_accession_code="IlluminaHiSeq2000",
                      experiment_accession_code="DRX001563",
                      experiment_title="It doesn't really matter.",
                      organism_id=9031,
                      organism_name="GALLUS GALLUS",
                      release_date="2013-07-19",
                      last_uploaded_date="2017-09-11",
                      status=BatchStatuses.NEW.value)
        batch.save()

        file = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_1.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="DRR002116_1.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=batch)
        file2 = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_2.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="DRR002116_2.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=batch)

        file.save()
        file2.save()
        batch.files = [file, file2]
        return (batch, [file, file2])
    def insert_objects(self) -> "Tuple[List[Batch], List[File]]":
        """Insert two ARRAY_EXPRESS batches whose files share a download URL.

        Returns ([batch, batch2], [file, file2]).  The previous annotation
        claimed List[Batch], which did not match the returned tuple.
        """
        download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
        batch = Batch(survey_job=self.survey_job,
                      source_type="ARRAY_EXPRESS",
                      pipeline_required="AFFY_TO_PCL",
                      platform_accession_code="A-AFFY-1",
                      experiment_accession_code="E-MTAB-3050",
                      experiment_title="It doesn't really matter.",
                      organism_id=9606,
                      organism_name="H**O SAPIENS",
                      release_date="2017-05-05",
                      last_uploaded_date="2017-05-05",
                      status=BatchStatuses.NEW.value)
        batch2 = copy.deepcopy(batch)
        batch.save()
        batch2.save()

        file = File(size_in_bytes=0,
                    download_url=download_url,
                    raw_format="CEL",
                    processed_format="PCL",
                    name="CE1234.CEL",
                    internal_location="A-AFFY-1/AFFY_TO_PCL/",
                    batch=batch)
        file2 = File(size_in_bytes=0,
                     download_url=download_url,
                     raw_format="CEL",
                     processed_format="PCL",
                     name="CE2345.CEL",
                     internal_location="A-AFFY-1/AFFY_TO_PCL/",
                     batch=batch2)
        file.save()
        file2.save()

        batch.files = [file]
        # Bug fix: batch2's in-memory file list previously pointed at
        # batch's file; file2 is the File that was created with batch2.
        batch2.files = [file2]

        return ([batch, batch2], [file, file2])
Example #20
0
    def _generate_batches(self,
                          samples: List[Dict],
                          experiment: Dict,
                          replicate_raw: bool = True) -> List[Batch]:
        """Generates a Batch for each sample in samples.

        Uses the metadata contained in experiment (which should be
        generated via get_experiment_metadata) to add additional
        metadata to each Batch. If replicate_raw is True (the default)
        then only raw files will be replicated. Otherwise all files
        will be replicated.

        Raises ValueError for a sample file whose comments contain no
        usable download URL.
        """
        for sample in samples:
            if "file" not in sample:
                continue

            # Resolve the organism; fall back to id 0 when the sample
            # metadata doesn't name one.
            organism_name = "UNKNOWN"
            for characteristic in sample["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == "UNKNOWN":
                logger.error(
                    "Sample from experiment %s did not specify the organism name.",
                    experiment["experiment_accession_code"],
                    survey_job=self.survey_job.id)
                organism_id = 0
            else:
                organism_id = Organism.get_id_for_name(organism_name)

            for sample_file in sample["file"]:
                # Generally we only want to replicate the raw data if
                # we can, however if there isn't raw data then we'll
                # take the processed stuff.
                if (replicate_raw and sample_file["type"] != "data") \
                        or sample_file["name"] is None:
                    continue

                # Reset per file: previously a file with no usable FTP
                # comment could silently inherit the download_url left
                # over from the preceding file in this loop.
                download_url = None

                # sample_file["comment"] is only a list if there's
                # more than one comment...
                comments = sample_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it, so this job
                    # must fail.
                    for comment in comments:
                        if comment["name"].find("FTP file") != -1:
                            download_url = comment["value"]
                else:
                    download_url = comments["value"]

                if download_url is None:
                    # Fail loudly instead of the old behavior of raising
                    # NameError (first file) or reusing a stale URL
                    # (subsequent files).
                    raise ValueError(
                        "Sample file {} has no FTP file comment with a "
                        "download URL.".format(sample_file["name"]))

                raw_format = sample_file["name"].split(".")[-1]
                processed_format = "PCL" if replicate_raw else raw_format

                file = File(
                    name=sample_file["name"],
                    download_url=download_url,
                    raw_format=raw_format,
                    processed_format=processed_format,
                    size_in_bytes=-1)  # Will have to be determined later

                self.add_batch(
                    platform_accession_code=experiment[
                        "platform_accession_code"],
                    experiment_accession_code=experiment[
                        "experiment_accession_code"],
                    organism_id=organism_id,
                    organism_name=organism_name,
                    experiment_title=experiment["name"],
                    release_date=experiment["release_date"],
                    last_uploaded_date=experiment["last_update_date"],
                    files=[file])