    def test_paired_files(self):
        """ Test the paired files function """

        files = ["s1.R1.fastq", "s1.R2.fastq", "s2.R1.fastq", "s2.R2.fastq"]

        expected_pairs = [["s1.R1.fastq", "s2.R1.fastq"],
                          ["s1.R2.fastq", "s2.R2.fastq"]]

        actual_pairs = utilities.paired_files(files,
                                              ".fastq",
                                              pair_identifier=".R1")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])

    def test_paired_files_identifier_not_found(self):
        """ Test the paired files function with an identifier that is not found"""

        files = [
            "sample-1.R1.fastq", "sample-1.R2.fastq", "sample-2.R1.fastq",
            "sample-2.R2.fastq"
        ]

        expected_pairs = [[], []]

        actual_pairs = utilities.paired_files(files,
                                              ".fastq",
                                              pair_identifier="_R1.")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])

    def test_paired_files_identifier_includes_extension(self):
        """ Test the paired files function with an identifier that is not found because
            it includes the period from the file extension"""

        files = [
            "sample-1.R1.fastq", "sample-1.R2.fastq", "sample-2.R1.fastq",
            "sample-2.R2.fastq"
        ]

        expected_pairs = [[], []]

        actual_pairs = utilities.paired_files(files,
                                              ".fastq",
                                              pair_identifier="R1.")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])

    def test_paired_files_duplicate_identifier_3(self):
        """ Test the paired files function with pair identified duplicated """

        files = [
            "MR100.R1.fastq", "MR100.R2.fastq", "MR200.R1.fastq",
            "MR200.R2.fastq"
        ]

        expected_pairs = [["MR100.R1.fastq", "MR200.R1.fastq"],
                          ["MR100.R2.fastq", "MR200.R2.fastq"]]

        actual_pairs = utilities.paired_files(files,
                                              "fastq",
                                              pair_identifier="R1")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])

    def test_paired_files_duplicate_identifier_2(self):
        """ Test the paired files function with a second identifier duplicated
            Also test extension without leading period. """

        files = [
            "s_2_1.fastq", "s_2_2.fastq", "s_2_3_1.fastq", "s_2_3_2.fastq"
        ]

        expected_pairs = [["s_2_1.fastq", "s_2_3_1.fastq"],
                          ["s_2_2.fastq", "s_2_3_2.fastq"]]

        actual_pairs = utilities.paired_files(files,
                                              "fastq",
                                              pair_identifier="_1")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])

    def test_paired_files_duplicate_identifier_1(self):
        """ Test the paired files function with a first identifier duplicated"""

        files = [
            "s_1_1.fastq.gz", "s_1_2.fastq.gz", "s_1_3_1.fastq.gz",
            "s_1_3_2.fastq.gz"
        ]

        expected_pairs = [["s_1_1.fastq.gz", "s_1_3_1.fastq.gz"],
                          ["s_1_2.fastq.gz", "s_1_3_2.fastq.gz"]]

        actual_pairs = utilities.paired_files(files,
                                              ".fastq.gz",
                                              pair_identifier="_1")

        self.assertEqual(expected_pairs[0], actual_pairs[0])
        self.assertEqual(expected_pairs[1], actual_pairs[1])
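
A rough sketch of the pairing rule these tests exercise (an assumption about the behaviour of utilities.paired_files, not its actual implementation): the pair identifier must sit immediately before the file extension, and the mate identifier is derived by swapping the first "1" for a "2".

import os

def paired_files_sketch(files, extension, pair_identifier=".R1"):
    """ Hypothetical reference behaviour for the tests above """
    if not extension.startswith("."):
        extension = "." + extension
    pair_identifier2 = pair_identifier.replace("1", "2", 1)
    pair1 = sorted(f for f in files
                   if os.path.basename(f).endswith(pair_identifier + extension))
    pair2 = sorted(f for f in files
                   if os.path.basename(f).endswith(pair_identifier2 + extension))
    # only report pairs when every mate 1 file has a matching mate 2 file
    if len(pair1) != len(pair2):
        return [[], []]
    return [pair1, pair2]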
Example #7
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)
        fastq_files = bam_to_fastq(workflow,
                                   input_files,
                                   project_dirs[1],
                                   paired_end=True,
                                   threads=args.threads,
                                   compress=False)
        paired_fastq_files = paired_files(fastq_files, '_R1')

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

        workflow.go()
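
A hedged sketch of how an entry point like this might be wired up with anadama2; the argument names are inferred from how args is used above (config_file, manifest_file, threads), and the add_argument calls are simplified assumptions rather than the project's actual driver.

from anadama2 import Workflow

if __name__ == "__main__":
    workflow = Workflow(version="0.1", description="Stage and package HG submission files")
    # argument names assumed from args.config_file, args.manifest_file and args.threads above
    workflow.add_argument("config-file", desc="the workflow configuration file")
    workflow.add_argument("manifest-file", desc="the manifest listing submitted files")
    workflow.add_argument("threads", desc="threads to use for bam to fastq conversion", default=4)
    main(workflow)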
Example #8
    args.input_extension = args.input_extension.replace(".gz", "")
    args.input_extension = args.input_extension.replace(".bz2", "")
else:
    # if the input files are fasta, bypass quality control
    qc_output_files = demultiplexed_files

### STEP #2: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, qc_output_files, args.output, args.threads,
        args.input_extension)

elif not args.bypass_functional_profiling or not args.bypass_strain_profiling:
    # get the names of the taxonomic profiling files allowing for pairs
    input_pair1, input_pair2 = utilities.paired_files(demultiplexed_files,
                                                      original_extension,
                                                      args.pair_identifier)
    sample_names = utilities.sample_names(
        input_pair1 if input_pair1 else input_files, original_extension,
        args.pair_identifier)
    tsv_profiles = utilities.name_files(sample_names,
                                        demultiplex_output_folder,
                                        tag="taxonomic_profile",
                                        extension="tsv")
    # check all of the expected profiles are found
    if len(tsv_profiles) != len(list(filter(os.path.isfile, tsv_profiles))):
        sys.exit(
            "ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"
            + "\n".join(tsv_profiles))
    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
Example #9
def demultiplex_dual(workflow, output_folder, input_files, extension,
                     barcode_files, dual_barcode_path, min_phred, pair_identifier):
    """Demultiplex the files (dual indexed paired)

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            input_files (list): A list of paths to fastq(gz) files for input to ea-utils.
            extension (string): The extension for all files.
            output_folder (string): The path of the output folder.
            barcode_files (list): A list of barcode files.
            dual_barcode_path (string): The path to the dual barcode file.
            min_phred (int): The min phred quality score to use in the demultiplex command.
            pair_identifier (string): The string in the file basename to identify
                the first pair in the set.

        Requires:
            ea-utils fastq-multx: A tool to demultiplex fastq files.

        Returns:
            list: A list of the demultiplexed files
            string: output folder of demultiplexed files

        """

    # capture the demultiplex stats in log file, one for each set of input files
    demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log",create_folder=True)
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",
                                            version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # get barcode files
    barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier)

    # get the second pair identifier
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    try:
        file_handle = open(dual_barcode_path)
        lines = file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path)

    run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "")
    demultiplex_files = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.split("\t")[0]

            if sample_name:
                nm1 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier + "." + extension
                nm2 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier2 + "." + extension
                demultiplex_files.add(nm1)
                demultiplex_files.add(nm2)

    # get the names of the expected output files
    # demultiplex_files = utilities.name_files(samples, demultiplex_output_folder, extension=extension)

    workflow.add_task(
        "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]]\
         -o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]]\
         -q [args[2]] > [targets[0]]",
        depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0]],
        args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name, fastq_multx_tracked],
        targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)],
        name="demultiplex_dual")

    demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files)


    return demultiplex_files, demultiplex_output_folder
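
A hypothetical call into demultiplex_dual, assuming an anadama2 workflow object and a dual-indexed paired-end run; every path below is a placeholder.

# sketch: demultiplex a dual-indexed paired-end run (illustrative file names only)
demux_files, demux_folder = demultiplex_dual(
    workflow,
    output_folder="output",
    input_files=["run1_R1.fastq.gz", "run1_R2.fastq.gz"],
    extension="fastq.gz",
    barcode_files=["barcodes_R1.fastq.gz", "barcodes_R2.fastq.gz"],
    dual_barcode_path="dual_barcodes.txt",
    min_phred=20,
    pair_identifier="_R1")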
Example #10
def demultiplex(workflow, input_files, extension, output_folder, barcode_file, index_files, min_phred, pair_identifier):
    """Demultiplex the files (single end or paired)
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files for input to ea-utils.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        barcode_file (string): A file of barcodes.
        index_files (list): A list of paths to the index files.
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        
    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.
        
    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files
        
    """
    
    # error if there is more than one index file
    if len(index_files) > 1:
        sys.exit("ERROR: Only one index file expected for demultiplexing step.")
    
    # read the barcode file to get the expected output files 
    try:
        file_handle=open(barcode_file)
        lines=file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read barcode file: " + barcode_file)
        
    samples=set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name=line.rstrip().split("\t")[0]
            if sample_name:
                samples.add(sample_name)
            
    # get the names of the expected output files
    demultiplex_fastq_files = utilities.name_files(samples,output_folder,subfolder="demultiplex",extension="fastq")
    
    # name the barcode file with the reverse complement barcodes added
    expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt",output_folder,subfolder="demultiplex",create_folder=True)
    
    # create a file that includes the reverse complements of the barcodes
    workflow.add_task(
        "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]",
        depends=barcode_file,
        targets=expanded_barcode_file)
    
    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)
    
    # capture the demultiplex stats in output files, one for each set of input files
    if input_pair1:
        demultiplex_log = utilities.name_files(input_pair1[0],output_folder,subfolder="demultiplex",extension="log")
    else:
        demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log")
        
    # get the output folder for all files
    demultiplex_output_folder = os.path.dirname(demultiplex_log)
    
    # get the basenames of the output files, one for each sample
    demultiplex_output_basenames = utilities.name_files(samples,output_folder,subfolder="demultiplex")
    
    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")
    
    if input_pair1 and input_pair2:
        # this run has paired input files
        # get the second pair identifier
        pair_identifier2=pair_identifier.replace("1","2",1)
        # get the names of the expected output files
        demultiplex_fastq_files_R1 = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files_R2 = [file+pair_identifier2+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files = demultiplex_fastq_files_R1+demultiplex_fastq_files_R2
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
        
    else:
        # this run has single end input files
        # get the names of the expected output files
        demultiplex_fastq_files = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")

    demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log, demultiplex_fastq_files)


    return demultiplex_fastq_files, demultiplex_output_folder
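
A hypothetical call into demultiplex for a paired-end run with a single index file; the paths are placeholders and the workflow object comes from anadama2.

# sketch: demultiplex paired-end reads using one index file and a barcode sheet
demux_fastqs, demux_folder = demultiplex(
    workflow,
    input_files=["run1_R1.fastq", "run1_R2.fastq"],
    extension="fastq",
    output_folder="output",
    barcode_file="barcodes.txt",
    index_files=["run1_I1.fastq"],
    min_phred=20,
    pair_identifier="_R1")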
Example #11
    def _generate_metadata_file(task):
        input_files = [seq_file.name for seq_file in task.depends[:-3]]
        studytrax_metadata = task.depends[-4].name
        broad_sample_sheet = task.depends[-3].name
        auxillary_metadata = task.depends[-1].name if task.depends[-1].name != '/dev/null' else None
        metadata_out_file = task.targets[0].name

        data_type_map = config.get('dtype_mapping')

        studytrax_df = pd.read_csv(studytrax_metadata)
        broad_sample_df = pd.read_csv(broad_sample_sheet, 
                                      na_values=['destroyed', 'missed'],
                                      parse_dates=['Actual Date of Receipt'])

        collection_dates_dict = m_utils.get_collection_dates(broad_sample_df)

        if pair_identifier:
            (input_pair1, input_pair2) = bb_utils.paired_files(input_files, pair_identifier)
            input_files = input_pair1 if input_pair1 else input_files

        sample_mapping = dict(zip(bb_utils.sample_names(input_files, pair_identifier),
                                  map(get_sample_id_from_fname, input_files)))
        sample_ids = [sid.replace(pair_identifier, '') for sid in sample_mapping.values()]

        sample_subset_df = broad_sample_df[(broad_sample_df['Parent Sample A'].isin(sample_ids)) |
                                           (broad_sample_df['Proteomics'].isin(sample_ids)) |
                                           (broad_sample_df['MbX'].isin(sample_ids)) |
                                           (broad_sample_df['Site/Sub/Coll'].isin(sample_ids))]

        metadata_df = sample_subset_df.merge(studytrax_df,
                                            left_on='Parent Sample A',
                                            right_on='st_q4',
                                            how='left')

        ## We sometimes get a situation where our studytrax metadata is missing
        ## some of the proteomics sample IDs so we need to make sure we replicate
        ## them
        metadata_df.loc[metadata_df['st_q17'].isnull(), 'st_q17'] = metadata_df['Proteomics']
        metadata_df.loc[metadata_df['st_q11'].isnull(), 'st_q11'] = metadata_df['MbX']
        metadata_df['data_type'] = data_type_map.get(data_type)

        if proteomics_metadata:
            proteomics_df = m_utils.add_proteomics_metadata(sample_subset_df, 
                                                            proteomics_metadata,
                                                            sample_mapping)
            metadata_df = metadata_df.merge(proteomics_df,
                                            on='Parent Sample A',
                                            how='left')


        metadata_df['External ID'] = metadata_df.apply(generate_external_id, axis=1)

        metadata_df['Site/Sub/Coll ID'] = metadata_df['Site/Sub/Coll'].map(lambda sid: str(sid))
        metadata_df['Site'] = metadata_df['SiteName']
        metadata_df['Participant ID'] = metadata_df['Subject'].map(lambda subj: 'C' + str(subj))
        metadata_df['visit_num'] = metadata_df['Collection #']
        metadata_df['Research Project'] = config.get('research_project')
        metadata_df['Project'] = metadata_df.apply(m_utils.get_project_id, axis=1)
        metadata_df = generate_collection_statistics(metadata_df,
                                                     collection_dates_dict)
        metadata_df = metadata_df.drop(config.get('drop_cols'), axis=1)
 
        if auxillary_metadata:
            ## Auxiliary metadata are columns that will be added into our
            ## existing metadata rows.
            metadata_df = m_utils.add_auxiliary_metadata(metadata_df,auxillary_metadata)

        metadata_df.to_csv(metadata_out_file, index=False)
                      desc="the path to the run_dbcan.py script",
                      default="/app/")

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input,
                                   extension=args.input_extension,
                                   exit_if_not_found=True)

### STEP #1: Run quality control on all input files ###
sample_names = utilities.sample_names(input_files, args.input_extension)
input_pair1, input_pair2 = utilities.paired_files(input_files,
                                                  args.input_extension,
                                                  args.pair_identifier)
paired = False
if input_pair1:
    sample_names = utilities.sample_names(input_pair1, args.input_extension,
                                          args.pair_identifier)
    qc_targets = [
        utilities.name_files([
            name + ".trimmed.1.fastq", name + ".trimmed.2.fastq",
            name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq",
            name + ".trimmed.single.12.fastq"
        ],
                             args.output,
                             subfolder="kneaddata",
                             create_folder=True) for name in sample_names
    ]
Example #13
def main(workflow):
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, 
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow, 
                                            deposited_files_mtx, 
                                            project_dirs_mtx[1],
                                            paired_end=True,
                                            compress=False,
                                            threads=args.threads)
            pair_identifier_mtx = "_R1"                                            
        else:
            paired_end_seqs = deposited_files_mtx

        adapter_trim_opts = ""
        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we will proceed just using the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed directly in via our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx, pair_identifier_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                    deposited_files_mgx, 
                                                    project_dirs_mgx[1],
                                                    paired_end=True,
                                                    compress=False,
                                                    threads=args.threads)
                    pair_identifier_mgx = "_R1"                                            
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)  

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db,
                                                                        rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')

                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                    pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='genes',
                                                tag='genefamilies_relab',
                                                extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='pathways',
                                                tag='pathabundance_relab',
                                                extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] 
                                                + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))
                                      
                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir, 
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                            [gene_file, ecs_file, path_file],
                                            tar_path,
                                            depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile otherwise we will run a taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + func_outs_match_mtx
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed 
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)
    
        workflow.go()
def main(args):
    config = parse_cfg_file(args.config)

    study_trax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_tracking,
                                  na_values=['destroyed', 'missed'],
                                  parse_dates=['Actual Date of Receipt'])
    proteomics_df = None
    metadata_df = None
    new_metadata_df = None

    date_today = datetime.date.today()
    metadata_file = os.path.join(args.output_dir,
                                 'hmp2_metadata_%s.csv' % date_today)

    ## Before we filter our metadata rows down to just to rows associated
    ## with the files we have present, we'll want a list of all the collection
    ## dates
    collection_dates_dict = get_collection_dates(broad_sample_df)

    biopsy_date_map = None
    if args.proteomics_metadata:
        proteomics_df = pd.read_table(args.proteomics_metadata)
    if args.biopsy_dates:
        biopsy_date_map = parse_biopsy_dates(args.biopsy_dates)

    ## The update procedure assumes either that we have an existing metadata
    ## file that we are going to be appending to/updating or that we are
    ## creating a fresh metadata sheet and will be adding the files in the
    ## manifest file to it.
    ## TODO: This needs to be re-worked to account for snagging datatypes as well.
    #if not args.metadata_file or args.refresh_all:
    #    sequence_files.extend(get_all_sequence_files(config.get('deposition_dir'),
    #                                                 config.get('input_extensions')))
    if args.manifest_file:
        manifest = parse_cfg_file(args.manifest_file)
        submitted_files = manifest.get('submitted_files')

        if submitted_files:
            new_metadata = []
            for (dtype, items) in submitted_files.iteritems():
                input_files = items.get('input')
                pair_identifier = items.get('pair_identifier')

                if pair_identifier:
                    (input_pair1, input_pair2) = bb_utils.paired_files(
                        input_files, pair_identifier)
                    input_files = input_pair1 if input_pair1 else input_files

                new_metadata.append(
                    get_metadata_rows(config, study_trax_df,
                                      broad_sample_df, proteomics_df,
                                      dtype, input_files, pair_identifier))

            new_metadata_df = pd.concat(new_metadata, ignore_index=True)

            #new_metadata_df[new_metadata_df['External ID'].isnull()] = None
            new_metadata_df['Site/Sub/Coll ID'] = new_metadata_df[
                'Site/Sub/Coll'].map(lambda sid: str(sid))
            #new_metadata_df['Participant ID'] = new_metadata_df['Subject'].map(lambda subj: 'C' + str(subj))
            if 'Collection #' in new_metadata_df.columns:
                new_metadata_df['visit_num'] = new_metadata_df['Collection #']
            new_metadata_df['Project'] = new_metadata_df.apply(get_project_id,
                                                               axis=1)
            new_metadata_df['ProjectSpecificID'] = pd.to_numeric(
                new_metadata_df['ProjectSpecificID'])
            new_metadata_df['Site'] = new_metadata_df['SiteName']
            new_metadata_df = new_metadata_df.apply(generate_external_id,
                                                    axis=1)

            new_metadata_df = remove_columns(new_metadata_df,
                                             config.get('drop_cols'))

    if args.metadata_file:
        metadata_df = pd.read_csv(args.metadata_file,
                                  parse_dates=['Actual Date of Receipt'])

        site_mapping = config.get('site_map')
        metadata_df['Site/Sub/Coll ID'] = metadata_df.apply(
            fix_site_sub_coll_id, args=(site_mapping, ), axis=1)
        metadata_df['PDO Number'] = metadata_df.apply(get_pdo_number, axis=1)

        if new_metadata_df is not None and not new_metadata_df.empty:
            metadata_df = pd.concat([metadata_df, new_metadata_df],
                                    ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(
                subset=['External ID', 'Site/Sub/Coll ID', 'data_type'],
                keep='last')
    else:
        metadata_df = new_metadata_df

    metadata_df[metadata_df['External ID'].isnull()] = metadata_df[
        metadata_df['External ID'].isnull()].apply(generate_external_id,
                                                   axis=1)

    if args.auxillary_metadata:
        for aux_file in args.auxillary_metadata:
            supp_df = pd.read_table(aux_file)
            supp_columns = supp_df.columns.tolist()

            idx_offset = 1
            if 'data_type' in supp_columns:
                join_id = supp_columns[:1] + ['data_type']
                idx_offset = 2
            else:
                join_id = supp_columns[0]

            ## We need to do this in two stages. If the columns already exist
            ## here we want to update them. If they do not exist we append
            ## them.
            metadata_cols = metadata_df.columns.tolist()
            new_cols = set(supp_columns[idx_offset:]) - set(metadata_cols)
            existing_cols = set(
                supp_columns[idx_offset:]).intersection(metadata_cols)

            if new_cols:
                supp_new_df = supp_df.filter(items=supp_columns[:idx_offset] +
                                             list(new_cols))
                metadata_df = metadata_df.merge(supp_new_df,
                                                how='left',
                                                on=join_id)

            if existing_cols:
                supp_existing_df = supp_df.filter(
                    items=supp_columns[:idx_offset] + list(existing_cols))
                metadata_df.set_index(join_id, inplace=True)
                supp_existing_df.set_index(join_id, inplace=True)

                metadata_df.update(supp_existing_df)
                metadata_df.reset_index(inplace=True)

    if args.add_all_stool_collections:
        metadata_df = add_all_stool_collections(metadata_df, study_trax_df,
                                                broad_sample_df)

    metadata_df['Actual Date of Receipt'] = pd.to_datetime(
        metadata_df['Actual Date of Receipt'])
    metadata_df['visit_num'] = metadata_df.apply(fill_visit_nums, axis=1)

    metadata_df['hbi_score'] = pd.to_numeric(metadata_df['hbi_score'])
    if 'Site' in metadata_df.columns.tolist():
        metadata_df['SiteName'] = metadata_df['Site']
    else:
        metadata_df['Site'] = metadata_df['SiteName']

    ## Couple small remaining changes
    metadata_df.loc[metadata_df.hbi_score > 900, 'hbi_score'] = None
    metadata_df.loc[metadata_df.consent_age > 150, 'consent_age'] = None
    metadata_df.loc[metadata_df['total_reads'].astype(
        'str').str.startswith('PDO'), 'total_reads'] = None
    metadata_df['Research Project'] = "ibdmdb"

    metadata_df = generate_collection_statistics(metadata_df,
                                                 collection_dates_dict,
                                                 biopsy_date_map)
    metadata_df = add_baseline_metadata_values(metadata_df, study_trax_df,
                                               config.get('baseline_cols'))

    metadata_df[metadata_df['SiteName'].isnull()] = fix_site_name(
        metadata_df[metadata_df['SiteName'].isnull()])
    metadata_df = reorder_columns(metadata_df, config.get('col_order'))
    metadata_df.drop(['Site'], axis=1, inplace=True)

    metadata_df = metadata_df.sort_values(
        ['data_type', 'Participant ID', 'visit_num'])
    metadata_df.to_csv(metadata_file, index=False)
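
A hedged sketch of a command-line driver for this metadata update script; the option names mirror the attributes read from args above and are otherwise assumptions.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build or refresh the HMP2 metadata sheet")
    parser.add_argument("--config", required=True, help="analysis configuration file")
    parser.add_argument("--studytrax-metadata", required=True, help="StudyTrax clinical metadata CSV")
    parser.add_argument("--broad-sample-tracking", required=True, help="Broad sample tracking sheet CSV")
    parser.add_argument("--output-dir", required=True, help="directory to write the updated metadata CSV")
    parser.add_argument("--manifest-file", help="manifest describing newly submitted files")
    parser.add_argument("--metadata-file", help="existing metadata CSV to update")
    parser.add_argument("--proteomics-metadata", help="proteomics metadata table")
    parser.add_argument("--biopsy-dates", help="file of biopsy collection dates")
    parser.add_argument("--auxillary-metadata", nargs="*", default=[], help="extra metadata tables to merge in")
    parser.add_argument("--add-all-stool-collections", action="store_true")
    main(parser.parse_args())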
Example #15
def merge_pairs_and_rename(workflow, method, input_files, extension,
                           output_folder, pair_identifier, threads,
                           fastq_ascii):
    """ Merge the files if pairs and rename sequence ids to match sample id
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis, usearch (default) or vsearch.
        input_files (list): A list of paths to fastq files.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        threads (int): The number of threads for each task.
        fastq_ascii (string): The fastq quality score ascii offset (for example, 33).
        
    Requires:
        usearch or vsearch
        
    Returns:
        list: A list of the renamed files.
        
    """

    pair1, pair2 = utilities.paired_files(input_files, extension,
                                          pair_identifier)

    if pair1 and pair2:
        # paired input files were found

        # if the files are gzipped, first decompress as fastq_mergepairs will take in fastq.gz but the output will not be correctly formatted
        if pair1[0].endswith(".gz"):
            # get the names of the decompressed output files
            decompressed_pair1 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair1],
                output_folder,
                subfolder="merged_renamed")
            # get the names of the decompressed output files
            decompressed_pair2 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair2],
                output_folder,
                subfolder="merged_renamed")

            # add tasks to decompress the files
            workflow.add_task_group("gunzip -c [depends[0]] > [targets[0]]",
                                    depends=pair1 + pair2,
                                    targets=decompressed_pair1 +
                                    decompressed_pair2)

            # the pair files to be used for the remaining tasks are those that are decompressed
            pair1 = decompressed_pair1
            pair2 = decompressed_pair2

        # get the sample names from the input file names
        sample_names = [
            os.path.basename(file).replace(pair_identifier + ".fastq", "")
            for file in pair1
        ]

        # get the names of the output files
        stitched_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="stitched",
                                              extension="fastq",
                                              create_folder=True)
        unjoined_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="unjoined",
                                              extension="fastq")

        # run usearch to merge pairs, if input files are non-empty
        for read1, read2, stitched_output, unjoined_output in zip(
                pair1, pair2, stitched_files, unjoined_files):
            if method == 'vsearch':
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="vsearch",
                                               threads=threads),
                    depends=[read1, read2,
                             TrackedExecutable("vsearch")],
                    targets=[stitched_output, unjoined_output],
                    name="vsearch_fastq_mergepairs")
            else:
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="userach",
                                               threads=threads,
                                               fastq_ascii=fastq_ascii),
                    depends=[read1, read2,
                             TrackedExecutable("usearch")],
                    targets=[stitched_output, unjoined_output],
                    name="usearch_fastq_mergepairs")

        # merge the stitched and unjoined from the prior step
        renamed_files = utilities.name_files(sample_names,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq")
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]",
            depends=zip(stitched_files, unjoined_files),
            targets=renamed_files)

    else:
        # these files are not pairs and do not need to be merged
        # rename the files
        renamed_files = utilities.name_files(input_files,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq",
                                             create_folder=True)
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]",
            depends=input_files,
            targets=renamed_files)

    return renamed_files
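
A hypothetical call into merge_pairs_and_rename, assuming gzipped paired fastq inputs and vsearch as the merge tool; the file names are placeholders.

# sketch: merge read pairs and rename sequence ids to match the sample id
renamed_fastqs = merge_pairs_and_rename(
    workflow,
    method="vsearch",
    input_files=["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],
    extension=".fastq.gz",
    output_folder="output",
    pair_identifier="_R1",
    threads=2,
    fastq_ascii="33")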