Beispiel #1
0
def download_fastq_files(fastq1_s3_path, fastq2_s3_path, working_dir):
    """
    Downloads the fastq files and exposes them under the names Isaac expects
    :param fastq1_s3_path: S3 path containing FASTQ with read1
    :param fastq2_s3_path: S3 path containing FASTQ with read2
    :param working_dir: working directory
    :return: local path to the folder containing the fastq
    :raises OSError: if the fastq folder cannot be created or a symlink
        cannot be placed
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.makedirs(fastq_folder)
    except OSError:
        # An already-existing folder is fine (e.g. a re-run); anything else
        # (permissions, a regular file in the way) is a real error.
        if not os.path.isdir(fastq_folder):
            raise

    # The values returned by download_file are used as the symlink targets.
    local_fastq1_path = download_file(fastq1_s3_path, fastq_folder)
    local_fastq2_path = download_file(fastq2_s3_path, fastq_folder)

    # Isaac requires the fastqs to be symlinked as lane1_read1.fastq.gz and lane1_read2.fastq.gz
    links = (
        (local_fastq1_path, 'lane1_read1.fastq.gz'),
        (local_fastq2_path, 'lane1_read2.fastq.gz'),
    )
    for local_path, link_name in links:
        link_path = os.path.join(fastq_folder, link_name)

        # The download already carries the required name: nothing to link.
        if os.path.abspath(local_path) == os.path.abspath(link_path):
            continue

        # Replace a stale link left behind by a previous run; a regular file
        # with that name is left alone so os.symlink reports the conflict.
        if os.path.islink(link_path):
            os.remove(link_path)

        os.symlink(local_path, link_path)

    return fastq_folder
    def handle_s3_download(self):
        """
        Fetch the S3-hosted inputs locally and repoint "args" at the copies.

        Nothing happens unless ``self.args.s3_upload`` is truthy. Otherwise
        the ``bam``, ``fasta`` and ``stb`` arguments are downloaded into a
        freshly generated working directory and each argument is replaced by
        the path of the same basename inside that directory. ``output`` is
        rewritten to the same basename inside a second generated directory.
        """
        if not self.args.s3_upload:
            return

        from job_utils import generate_working_dir, delete_working_dir  # , setup_logger
        from s3_utils import download_file, upload_file, download_folder, upload_folder, read_s3_file

        tmp_dir = generate_working_dir('/mnt/temp')
        logging.info("Downloading to {0}".format(tmp_dir))

        # Same treatment for each input: download, then swap in the local path.
        for attr_name in ('bam', 'fasta', 'stb'):
            remote_path = getattr(self.args, attr_name)
            download_file(remote_path, tmp_dir)
            local_path = os.path.join(tmp_dir, os.path.basename(remote_path))
            setattr(self.args, attr_name, local_path)

        scratch_dir = generate_working_dir('/mnt/scratch')
        output_name = os.path.basename(self.args.output)
        self.args.output = os.path.join(scratch_dir, output_name)
def build_country_data(demographic_data=DEMOGRAPHIC_DATA, bed_data=BED_DATA):
    """
    Combine the latest disease figures with demographics and hospital beds.

    The disease CSV is fetched through ``download_file``, whose result is
    unpacked as ``(bytes, last_modified)``. All three tables are joined on
    "Country/Region".

    :param demographic_data: demographics table; its "Population" column is
        used after the merge with the bed table
    :param bed_data: bed table providing "Latest Bed Estimate"
    :return: tuple of (per-country dict from ``to_dict(orient="index")``,
        ``last_modified`` as returned by ``download_file``)
    """
    join_key = "Country/Region"
    # Disease and demographic tables say "US"; rename so they match bed data.
    us_alias = {"US": "United States"}

    raw_csv, last_modified = download_file("latest_disease_data.csv")
    cases = pd.read_csv(StringIO(raw_csv.decode()), index_col=join_key)
    cases = cases.rename(index=us_alias)
    demographics = demographic_data.rename(index=us_alias)

    combined = cases.merge(demographics, on=join_key)

    # Bed estimates are per 1000 people; scale by population for absolutes.
    beds = bed_data.merge(demographics, on=join_key)
    beds_per_thousand = beds["Latest Bed Estimate"]
    beds["Num Hospital Beds"] = beds_per_thousand * beds["Population"] / 1000

    absolute_beds = beds[["Num Hospital Beds"]]
    combined = combined.merge(absolute_beds, on=join_key)
    return combined.to_dict(orient="index"), last_modified
Beispiel #4
0
 def download_data():
     """Call ``download_file`` with "data.pkl" as both local and remote name."""
     pickle_name = "data.pkl"
     download_file(remote=pickle_name, local=pickle_name)
Beispiel #5
0
def download_data(args, working_dir, tmp_dir):
    """
    Download every input named in ``args`` into ``tmp_dir`` and build the
    matching command-line argument string.

    :param args: parsed arguments; the attributes ``bam``, ``fasta``, ``IS``,
        ``genes``, ``stb``, ``scaffolds`` and ``wd_name`` are read. Inputs
        set to ``None`` are skipped.
    :param working_dir: directory under which the output location is placed
    :param tmp_dir: local directory that receives the downloads
    :return: tuple ``(cmd_string, wd_loc)`` where ``cmd_string`` holds the
        local paths/flags and ends with ``-o <wd_loc>``
    :raises subprocess.CalledProcessError: if ``gzip -d`` fails on a
        downloaded ``.gz`` file
    """
    cmd_string = ''

    # Download the .bam
    if args.bam is not None:
        logging.info("Downloading bam to {0}".format(tmp_dir))
        download_file(args.bam, tmp_dir)
        cmd_string += os.path.join(tmp_dir, os.path.basename(args.bam)) + ' '

        # The index is best-effort: a missing .bai must not abort the job,
        # but Ctrl-C / SystemExit should still propagate.
        logging.info("Downloading bam index")
        try:
            download_file(args.bam + '.bai', tmp_dir)
        except Exception:
            logging.warning(
                "Could not download bam index {0}.bai; continuing without it"
                .format(args.bam))

    # Download the .fasta
    if args.fasta is not None:
        logging.info("Downloading fasta to {0}".format(tmp_dir))
        download_file(args.fasta, tmp_dir)
        cmd_string += os.path.join(tmp_dir, os.path.basename(args.fasta)) + ' '

    # Download the IS profiles
    if args.IS is not None:
        cmd_string += '-i '

        # Several values are used as-is; a single value is read through
        # read_s3_file and split into one location per line.
        if len(args.IS) > 1:
            is_locs = args.IS
        else:
            is_locs = read_s3_file(args.IS[0]).split('\n')

        for is_loc in is_locs:
            is_loc = is_loc.strip()
            if not is_loc:
                continue

            # Name the local folder after the last path component,
            # ignoring one trailing slash.
            parts = is_loc.split('/')
            is_name = parts[-2] if is_loc.endswith('/') else parts[-1]
            is_dir = os.path.join(tmp_dir, is_name)

            logging.info("Downloading IS to {0}".format(is_dir))
            # Only fetch the files matching the include patterns.
            download_folder(is_loc,
                            is_dir,
                            exclude=['*'],
                            include=[
                                '*attributes*', '*cumulative_snv_table*',
                                '*scaffold2length*', '*covT*'
                            ])
            cmd_string += ' {0} '.format(is_dir)

    # Download other files
    optional_inputs = (
        (args.genes, '-g'),
        (args.stb, '-s'),
        (args.scaffolds, '--scaffolds'),
    )
    for remote_path, flag in optional_inputs:
        if remote_path is not None:
            logging.info("{0} is {1}; downloading".format(flag, remote_path))
            download_file(remote_path, tmp_dir)
            cmd_string += flag + ' ' + os.path.join(
                tmp_dir, os.path.basename(remote_path)) + ' '

    # 2) Unzip if need be
    for gz_path in glob.glob(tmp_dir + '/*.gz'):
        # Argument list instead of a split string so that paths containing
        # whitespace or quotes reach gzip intact.
        subprocess.check_call(['gzip', '-d', gz_path])

        # Alter the command string to fit the unzip (drop the ".gz" suffix)
        gz_name = os.path.basename(gz_path)
        cmd_string = cmd_string.replace(gz_name + ' ', gz_name[:-3] + ' ')

    # Get the work directory
    wd_loc = os.path.join(working_dir, args.wd_name)
    cmd_string += ' -o ' + wd_loc

    return cmd_string, wd_loc