def init_objects():
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.DOWNLOADED.value
    )
    batch.save()

    file = File(size_in_bytes=0,
                download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
                raw_format="CEL",
                processed_format="PCL",
                name="CE1234.CEL",
                internal_location="A-AFFY-1/AFFY_TO_PCL/",
                batch=batch)
    file.save()

    batch.files = [file]
    return batch
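A minimal usage sketch (not part of the original snippet) showing how a helper like init_objects() is typically consumed; the test class and assertions below are illustrative only:

from django.test import TestCase


class InitObjectsTestCase(TestCase):
    def test_init_objects(self):
        # Hypothetical test: the helper should return a saved Batch in the
        # DOWNLOADED state with exactly one attached File.
        batch = init_objects()
        self.assertEqual(batch.status, BatchStatuses.DOWNLOADED.value)
        self.assertEqual(len(batch.files), 1)
        self.assertEqual(batch.files[0].name, "CE1234.CEL")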
Example #2
    def test_multiple_batches(self, mock_download_file):
        # Just in case this test ever breaks, we don't actually want
        # to download the file because that'll take a while to fail.
        mock_download_file.return_value = True

        batch, _ = self.insert_objects()
        batch2 = Batch(survey_job=self.survey_job,
                       source_type="SRA",
                       pipeline_required="SALMON",
                       platform_accession_code="IlluminaHiSeq2000",
                       experiment_accession_code="DRX001564",
                       experiment_title="It doesn't really matter.",
                       organism_id=9031,
                       organism_name="GALLUS GALLUS",
                       release_date="2013-07-19",
                       last_uploaded_date="2017-09-11",
                       status=BatchStatuses.NEW.value)
        batch2.save()
        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch, batch2], downloader_task="dummy")
        downloader_job.save()

        sra.download_sra(downloader_job.id)

        completed_job = DownloaderJob.objects.get(id=downloader_job.id)
        self.assertFalse(completed_job.success)
        self.assertEqual(completed_job.failure_reason,
                         ("More than one batch found for SRA downloader job. "
                          "There should only be one."))
Example #3
    @classmethod
    def setUpClass(cls):
        survey_job = SurveyJob(
            source_type="ARRAY_EXPRESS"
        )
        survey_job.save()

        batch = Batch(
            survey_job=survey_job,
            source_type="ARRAY_EXPRESS",
            pipeline_required="AFFY_TO_PCL",
            platform_accession_code="A-AFFY-141",
            experiment_accession_code="E-GEOD-59071",
            experiment_title="It doesn't really matter.",
            organism_id=9606,
            organism_name="H**O SAPIENS",
            release_date="2017-05-05",
            last_uploaded_date="2017-05-05",
            status=BatchStatuses.NEW.value
        )
        batch.save()

        file = File(
            size_in_bytes=0,
            download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL",
            batch=batch
        )
        file.save()

        super(FilesTestCase, cls).setUpClass()
Example #4
def _insert_salmon_index():
    """Creates a batch for the index for the organism for the test."""
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="TRANSCRIPTOME_INDEX",
                  pipeline_required="TRANSCRIPTOME_INDEX",
                  platform_accession_code="TEST",
                  experiment_accession_code="HOMO_SAPIENS",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  organism_name="H**O SAPIENS",
                  release_date="2017-11-02",
                  last_uploaded_date="2017-11-02",
                  status=BatchStatuses.PROCESSED.value)
    batch.save()

    kmer_size = BatchKeyValue(key="kmer_size", value="23", batch=batch)
    kmer_size.save()

    index_file = File(
        size_in_bytes=2214725074,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="Homo_sapiens_short.gtf.gz",
        internal_location="TEST/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens"
                      "/Homo_sapiens.GRCh38.90.gtf.gz"),
        batch=batch)
    index_file.save()
Example #5
def get_batch():
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-1",
                  experiment_accession_code="E-MTAB-3050",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  organism_name="H**O SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch.save()

    File(size_in_bytes=0,
         download_url="example.com",
         raw_format="CEL",
         processed_format="PCL",
         name="CE1234.CEL",
         internal_location="A-AFFY-1/AFFY_TO_PCL/",
         batch=batch).save()
    return batch
Example #6
    def handle(self, *args, **options):
        # Create all the dummy data that would have been created
        # before a downloader job could have been generated.
        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        survey_job.save()

        batch = Batch(survey_job=survey_job,
                      source_type="ARRAY_EXPRESS",
                      pipeline_required="AFFY_TO_PCL",
                      platform_accession_code="A-AFFY-141",
                      experiment_accession_code="E-GEOD-59071",
                      experiment_title="It doesn't really matter.",
                      organism_id=9606,
                      organism_name="H**O SAPIENS",
                      release_date="2017-05-05",
                      last_uploaded_date="2017-05-05",
                      status=BatchStatuses.NEW.value)
        batch.save()

        file = File(
            batch=batch,
            size_in_bytes=0,
            download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072_CD_colon_active_2.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL")
        file.save()

        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch])
        send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
Example #7
    def test_aspera_downloader(self):
        """ """

        batch = Batch(survey_job=self.survey_job,
                      source_type="SRA",
                      pipeline_required="SALMON",
                      platform_accession_code="IlluminaHiSeq2000",
                      experiment_accession_code="DRX001563",
                      experiment_title="It doesn't really matter.",
                      organism_id=9031,
                      organism_name="GALLUS GALLUS",
                      release_date="2013-07-19",
                      last_uploaded_date="2017-09-11",
                      status=BatchStatuses.NEW.value)
        batch.save()

        # This is converted from FTP URL to use Aspera
        file = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="ERR036000_1.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=batch)
        dj = DownloaderJob()

        self.assertTrue(sra._download_file(file, dj, file.name))
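The "converted from FTP URL to use Aspera" comment refers to rewriting the EBI FTP address into an Aspera source; a rough sketch of that kind of conversion (the actual logic lives in sra._download_file and may differ):

def ftp_url_to_aspera_source(download_url: str) -> str:
    # Illustrative only: EBI exposes the same paths over Aspera through the
    # era-fasp user, so the scheme and host prefix are swapped out.
    return download_url.replace("ftp://ftp.sra.ebi.ac.uk/",
                                "era-fasp@fasp.sra.ebi.ac.uk:")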
Example #8

    def run_transcriptome_processor(self):
        # Create all the dummy data that would have been created
        # before a processor job could have been generated.
        survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
        survey_job.save()

        batch = Batch(
            survey_job=survey_job,
            source_type="TRANSCRIPTOME_INDEX",
            pipeline_required="TRANSCRIPTOME_INDEX",
            platform_accession_code="EnsemblPlants",
            experiment_accession_code="aegilops_tauschii",
            experiment_title="It doesn't really matter.",
            organism_id=37682,
            organism_name="AEGILOPS TAUSCHII",
            release_date="2017-11-02",
            last_uploaded_date="2017-11-02",
            status=BatchStatuses.DOWNLOADED.value,
        )
        batch.save()

        kmer_size_property = BatchKeyValue(batch=batch,
                                           key="kmer_size",
                                           value="31")
        kmer_size_property.save()

        gtf_file = File(
            name="aegilops_tauschii_short.gtf.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
            raw_format="gtf.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=batch)
        gtf_file.save()

        fasta_file = File(
            name="aegilops_tauschii_short.fa.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                "/aegilops_tauschii/dna/Aegilops_tauschii."
                "ASM34733v1.dna.toplevel.fa.gz"),
            raw_format="fa.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=batch)
        fasta_file.save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
Example #9
    def test_survey(self):
        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        surveyor = ArrayExpressSurveyor(survey_job)
        file1 = File(download_url="a")
        file2 = File(download_url="a")
        file3 = File(download_url="b")
        file4 = File(download_url="a")
        batch1 = Batch(files=[file1])
        batch2 = Batch(files=[file2])
        batch3 = Batch(files=[file3, file4])

        surveyor.batches = [batch1, batch2, batch3]
        groups = surveyor.group_batches()
        self.assertEqual(groups, [[batch1, batch2], [batch3]])
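The assertion above implies that group_batches() places Batches whose Files have identical download URLs into the same group, so one download can serve all of them; a hedged reimplementation of just that observed behavior (not the surveyor's actual code):

from itertools import groupby


def group_batches_sketch(batches):
    # Batches whose files share the same set of download URLs end up together.
    def key(batch):
        return tuple(sorted(f.download_url for f in batch.files))

    return [list(group) for _, group in groupby(sorted(batches, key=key), key=key)]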
Example #10

def init_objects():
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="TRANSCRIPTOME_INDEX",
                  pipeline_required="TRANSCRIPTOME_INDEX",
                  platform_accession_code="EnsemblPlants",
                  experiment_accession_code="aegilops_tauschii",
                  experiment_title="It doesn't really matter.",
                  organism_id=37682,
                  organism_name="AEGILOPS TAUSCHII",
                  release_date="2017-11-02",
                  last_uploaded_date="2017-11-02",
                  status=BatchStatuses.DOWNLOADED.value)
    batch.save()
    BatchKeyValue(batch=batch, key="length", value="_short").save()
    BatchKeyValue(batch=batch, key="kmer_size", value="23").save()

    gtf_file = File(
        size_in_bytes=-1,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.gtf.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
            "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        batch=batch)
    gtf_file.save()

    fasta_file = File(
        size_in_bytes=-1,
        raw_format="fa.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.fa.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
            "/aegilops_tauschii/dna/Aegilops_tauschii."
            "ASM34733v1.dna.toplevel.fa.gz"),
        batch=batch)
    fasta_file.save()

    batch.files = [gtf_file, fasta_file]
    return (batch, gtf_file, fasta_file)
Example #11
    def run_sra_processor(self):
        # Create all the dummy data that would have been created
        # before a processor job could have been generated.
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()

        batch = Batch(
            survey_job=survey_job,
            source_type="SRA",
            pipeline_required="SALMON",
            platform_accession_code="IlluminaHiSeq2500",
            experiment_accession_code="PRJEB5018",
            experiment_title="It doesn't really matter.",
            organism_id=10090,
            organism_name="MUS MUSCULUS",
            release_date="2014-03-25",
            last_uploaded_date="2016-05-20",
            status=BatchStatuses.NEW.value,
        )
        batch.save()

        File(name="ERR1680082_1.fastq",
             download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                           "ERR1680082/ERR1680082_1.fastq.gz"),
             raw_format="fastq",
             processed_format="sf",
             internal_location="IlluminaHiSeq2500/SALMON",
             size_in_bytes=2214725074,
             batch=batch).save()

        File(name="ERR1680082_2.fastq",
             download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                           "ERR1680082/ERR1680082_2.fastq.gz"),
             raw_format="fastq",
             processed_format="sf",
             internal_location="IlluminaHiSeq2500/SALMON",
             size_in_bytes=2214725074,
             batch=batch).save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
Example #12

    def insert_objects(self) -> List[Batch]:
        download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
        batch1 = Batch(survey_job=self.survey_job,
                       source_type="TRANSCRIPTOME_INDEX",
                       pipeline_required="TRANSCRIPTOME_INDEX",
                       platform_accession_code="EnsemblPlants",
                       experiment_accession_code="AEGILOPS_TAUSCHII",
                       experiment_title="It doesn't really matter.",
                       organism_id=37682,
                       organism_name="AEGILOPS TAUSCHII",
                       release_date="2017-05-05",
                       last_uploaded_date="2017-05-05",
                       status=BatchStatuses.NEW.value)
        batch2 = copy.deepcopy(batch1)
        batch1.save()
        batch2.save()

        for batch, length, kmer_size in [(batch1, "_short", "23"),
                                         (batch2, "_long", "31")]:
            BatchKeyValue(batch=batch, key="length", value=length).save()
            BatchKeyValue(batch=batch, key="kmer_size", value=kmer_size).save()

            file1 = File(size_in_bytes=0,
                         download_url=self.fasta_download_url,
                         raw_format="fa.gz",
                         processed_format="tar.gz",
                         name="Aegilops_tauschii{}.fa.gz".format(length),
                         internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                         batch=batch)
            file2 = File(size_in_bytes=0,
                         download_url=self.gtf_download_url,
                         raw_format="gtf.gz",
                         processed_format="tar.gz",
                         name="Aegilops_tauschii{}.gtf.gz".format(length),
                         internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                         batch=batch)
            file1.save()
            file2.save()
            batch.files = [file1, file2]

        return [batch1, batch2]
Example #13
def init_objects():
    survey_job = SurveyJob(source_type="SALMON")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="SALMON",
                  pipeline_required="SALMON",
                  platform_accession_code="IlluminaGenomeAnalyzerII",
                  experiment_accession_code="ERX000259",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  organism_name="H**O SAPIENS",
                  release_date="2017-11-02",
                  last_uploaded_date="2017-11-02",
                  status=BatchStatuses.DOWNLOADED.value)
    batch.save()

    first_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_1.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_1.fastq.gz"),
        batch=batch)
    first_fastq_file.save()

    second_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_2.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_2.fastq.gz"),
        batch=batch)
    second_fastq_file.save()

    batch.files = [first_fastq_file, second_fastq_file]
    return (batch, first_fastq_file, second_fastq_file)
Example #14

    def insert_objects(self) -> List[Batch]:
        download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
        batch = Batch(survey_job=self.survey_job,
                      source_type="ARRAY_EXPRESS",
                      pipeline_required="AFFY_TO_PCL",
                      platform_accession_code="A-AFFY-1",
                      experiment_accession_code="E-MTAB-3050",
                      experiment_title="It doesn't really matter.",
                      organism_id=9606,
                      organism_name="H**O SAPIENS",
                      release_date="2017-05-05",
                      last_uploaded_date="2017-05-05",
                      status=BatchStatuses.NEW.value)
        batch2 = copy.deepcopy(batch)
        batch.save()
        batch2.save()

        file = File(size_in_bytes=0,
                    download_url=download_url,
                    raw_format="CEL",
                    processed_format="PCL",
                    name="CE1234.CEL",
                    internal_location="A-AFFY-1/AFFY_TO_PCL/",
                    batch=batch)
        file2 = File(size_in_bytes=0,
                     download_url=download_url,
                     raw_format="CEL",
                     processed_format="PCL",
                     name="CE2345.CEL",
                     internal_location="A-AFFY-1/AFFY_TO_PCL/",
                     batch=batch2)
        file.save()
        file2.save()

        batch.files = [file]
        batch2.files = [file2]

        return ([batch, batch2], [file, file2])
Example #15
    def insert_objects(self) -> List[Batch]:
        download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
        batch = Batch(survey_job=self.survey_job,
                      source_type="SRA",
                      pipeline_required="SALMON",
                      platform_accession_code="IlluminaHiSeq2000",
                      experiment_accession_code="DRX001563",
                      experiment_title="It doesn't really matter.",
                      organism_id=9031,
                      organism_name="GALLUS GALLUS",
                      release_date="2013-07-19",
                      last_uploaded_date="2017-09-11",
                      status=BatchStatuses.NEW.value)
        batch.save()

        file = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_1.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="DRR002116_1.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=batch)
        file2 = File(
            size_in_bytes=0,
            download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_2.fastq.gz",  # noqa
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="DRR002116_2.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=batch)

        file.save()
        file2.save()
        batch.files = [file, file2]
        return (batch, [file, file2])
Example #16
    def add_batch(self,
                  platform_accession_code: str,
                  experiment_accession_code: str,
                  organism_id: int,
                  organism_name: str,
                  experiment_title: str,
                  release_date,
                  last_uploaded_date,
                  files: List[File],
                  key_values: Dict = {}):
        # Prevent creating duplicate Batches.
        for file in files:
            if File.objects.filter(name=file.name).count() != 0:
                logger.info((
                    "Skipping sample with name %s because a File already exists with "
                    "that name."), file.name)
                return

        batch = Batch(survey_job=self.survey_job,
                      source_type=self.source_type(),
                      status=BatchStatuses.NEW.value,
                      platform_accession_code=platform_accession_code,
                      experiment_accession_code=experiment_accession_code,
                      organism_id=organism_id,
                      organism_name=organism_name,
                      experiment_title=experiment_title,
                      release_date=release_date,
                      last_uploaded_date=last_uploaded_date)
        batch.files = files
        batch.pipeline_required = self.determine_pipeline(batch,
                                                          key_values).value
        batch.save()

        for file in files:
            file.internal_location = os.path.join(
                batch.platform_accession_code, batch.pipeline_required)
            file.batch = batch
            file.save()

        for key, value in key_values.items():
            BatchKeyValue(batch=batch, key=key, value=value).save()

        self.batches.append(batch)
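A hedged sketch of how add_batch() might be called from a concrete surveyor; every accession, file name, URL, and key below is illustrative rather than taken from the original:

# Hypothetical usage: `surveyor` is an instance of the class defining add_batch().
cel_file = File(name="GSM1426072.CEL",
                size_in_bytes=0,
                raw_format="CEL",
                processed_format="PCL",
                download_url="ftp://example.org/GSM1426072.CEL")
surveyor.add_batch(platform_accession_code="A-AFFY-141",
                   experiment_accession_code="E-GEOD-59071",
                   organism_id=9606,
                   organism_name="HOMO SAPIENS",
                   experiment_title="An example experiment.",
                   release_date="2017-05-05",
                   last_uploaded_date="2017-05-05",
                   files=[cel_file],
                   key_values={"array_design": "A-AFFY-141"})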