Exemple #1
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outpath):
        """Copy every GPR-format file of the input into ``outpath``.

        ``antecedents.identifier`` is handed to
        ``module_utils.unzip_if_zip`` and the returned path is listed as a
        directory.  Each entry (except ``.DS_Store``) is checked with
        ``gpr_module.check_gpr``; the entries that pass are copied into
        ``outpath``, which is created when it does not exist yet.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, or none of them
                passes the GPR check.
        """
        import os
        import shutil
        from Betsy import gpr_module
        from Betsy import module_utils

        src_dir = module_utils.unzip_if_zip(antecedents.identifier)

        # Collect the directory entries, leaving out the .DS_Store entry.
        candidates = []
        for name in os.listdir(src_dir):
            if name == ".DS_Store":
                continue
            candidates.append(name)
        assert candidates, 'The input folder or zip file is empty.'

        # Keep only the entries recognised as GPR files.
        gpr_names = []
        for name in candidates:
            if gpr_module.check_gpr(os.path.join(src_dir, name)):
                gpr_names.append(name)
        assert gpr_names, 'There are no gpr files in the input.'

        if not os.path.exists(outpath):
            os.mkdir(outpath)
        for name in gpr_names:
            shutil.copyfile(
                os.path.join(src_dir, name), os.path.join(outpath, name))
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Extract the fasta/fastq RNA-seq files into the ``outfile`` folder.

        The input (``antecedents.identifier``) is passed through
        ``module_utils.unzip_if_zip`` and the resulting directory is
        scanned.  Entries whose name ends in ``fa`` or ``fastq`` are copied
        unchanged; entries ending in ``fa.gz`` or ``fastq.gz`` are
        decompressed with ``module_utils.gunzip`` and the decompressed file
        is stored under the name without the ``.gz`` extension.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, or if
                ``filelib.exists_nz`` rejects the output folder afterwards.
        """
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils
        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        format_types = ['fa', 'fastq']
        for format_type in format_types:
            for filename in filenames:
                if filename == '.DS_Store':
                    continue
                # Resolve against the directory that was actually listed;
                # it can differ from in_data.identifier (e.g. zip input).
                fileloc = os.path.join(directory, filename)
                if fileloc.endswith(format_type + '.gz'):
                    newfname = os.path.splitext(filename)[0]
                    # NOTE(review): gunzip is assumed to return the path of
                    # the decompressed file -- confirm in module_utils.
                    new_file = module_utils.gunzip(fileloc)
                    shutil.copyfile(new_file, os.path.join(outfile, newfname))
                    # The decompressed file is only an intermediate; drop it
                    # once its content is in the output folder.
                    os.remove(new_file)
                elif fileloc.endswith(format_type):
                    shutil.copyfile(fileloc, os.path.join(outfile, filename))

        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_rna_files_fastq fails' % outfile)
    def set_out_attributes(self, antecedents, out_attributes):
        """Set ``out_attributes['format_type']`` from the input's file names.

        The value is ``'fastqfolder'`` when at least one entry (other than
        ``.DS_Store``) has a name ending in ``fa``, ``fastq``, ``fa.gz`` or
        ``fastq.gz``; otherwise it is ``'not_fastqfolder'``.

        Returns:
            The same ``out_attributes`` mapping, updated in place.

        Raises:
            AssertionError: if the input holds no entries.
        """
        import os
        import shutil
        from Betsy import module_utils
        source = antecedents.identifier
        directory = module_utils.unzip_if_zip(source)
        filenames = os.listdir(directory)
        # Only the listing is needed; when unzip_if_zip returned a path other
        # than the input itself, that directory is removed again.
        if directory != source:
            shutil.rmtree(directory)

        assert filenames, 'The input folder or zip file is empty.'

        suffixes = ('fa', 'fa.gz', 'fastq', 'fastq.gz')
        has_sequence_file = any(
            name.endswith(suffixes)
            for name in filenames
            if name != '.DS_Store')

        if has_sequence_file:
            label = 'fastqfolder'
        else:
            label = 'not_fastqfolder'
        out_attributes['format_type'] = label
        return out_attributes
Exemple #4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Run ``samtools sort`` on every file of the input folder.

        For each entry of the input directory the command
        ``[samtools, 'sort', <infile>, <outname>]`` is executed, where
        ``<outname>`` is ``<outfile>/<stem>_sorted.bam``.  The ``outfile``
        folder is created when it does not exist yet.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, the samtools
                binary configured in ``config.samtools`` does not exist, or
                an expected output fails ``module_utils.exists_nz``.
            ValueError: if the text ``error`` occurs in the stderr output of
                a samtools run.
        """
        import os
        import subprocess
        from Betsy import module_utils
        from genomicode import config
        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        samtools_BIN = config.samtools
        assert os.path.exists(
            samtools_BIN), 'cannot find the %s' % samtools_BIN
        for filename in filenames:
            infile = os.path.join(directory, filename)
            outname = os.path.splitext(filename)[0] + '_sorted.bam'
            outname = os.path.join(outfile, outname)
            command = [samtools_BIN, 'sort', infile, outname]

            process = subprocess.Popen(command,
                                       shell=False,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            # communicate() reads both pipes until the child exits.  Calling
            # wait() first could deadlock once the child fills a pipe buffer
            # that nobody is reading.
            error_message = process.communicate()
            if 'error' in error_message[1]:
                raise ValueError(error_message)
            assert module_utils.exists_nz(outname), (
                'the output file %s for sort_bam_folder does not exist' %
                outname)
Exemple #5
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Extract the CEL files whose version is cc1, v3 or v4.

        Every entry of the input directory (except ``.DS_Store``) is
        inspected with ``affyio.guess_cel_version``; entries reported as
        ``'cc1'``, ``'v3'`` or ``'v4'`` are copied into the ``outfile``
        folder, which is created when it does not exist yet.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, or if
                ``filelib.exists_nz`` rejects the output folder after
                copying.
            ValueError: if no entry has one of the accepted CEL versions.
        """
        import os
        import shutil
        from Betsy import module_utils
        from genomicode import affyio
        from genomicode import filelib
        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        found_cel = False
        for filename in filenames:
            if filename == '.DS_Store':
                continue
            fileloc = os.path.join(directory, filename)
            cel_v = affyio.guess_cel_version(fileloc)
            if cel_v in ['cc1', 'v3', 'v4']:
                shutil.copyfile(fileloc, os.path.join(outfile, filename))
                found_cel = True

        if not found_cel:
            # Must be raised: asserting an exception instance always passes
            # because the instance is truthy, which would hide the failure.
            raise ValueError('There is no cel file in the input.')
        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_CEL_files fails' % outfile)
Exemple #6
0
    def set_out_attributes(self, antecedents, out_attributes):
        """Set ``out_attributes['format_type']`` from the input's file names.

        The value is ``'samfolder'`` when at least one entry (other than
        ``.DS_Store``) has a name ending in ``sam`` or ``sam.gz``; otherwise
        it is ``'not_samfolder'``.

        Returns:
            The same ``out_attributes`` mapping, updated in place.

        Raises:
            AssertionError: if the input holds no entries.
        """
        import os
        import shutil
        from Betsy import module_utils
        source = antecedents.identifier
        directory = module_utils.unzip_if_zip(source)
        filenames = os.listdir(directory)
        # Only the listing is needed; when unzip_if_zip returned a path other
        # than the input itself, that directory is removed again.
        if directory != source:
            shutil.rmtree(directory)

        assert filenames, 'The input folder or zip file is empty.'

        # The suffix test depends only on the entry name, so the names are
        # checked directly instead of being joined to the input path first.
        suffixes = ('sam.gz', 'sam')
        has_sam_file = any(
            name.endswith(suffixes)
            for name in filenames
            if name != '.DS_Store')

        if has_sam_file:
            label = 'samfolder'
        else:
            label = 'not_samfolder'
        out_attributes['format_type'] = label
        return out_attributes
Exemple #7
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Copy the Agilent-format files of the input into ``outfile``.

        A regular file is accepted when, within its first ten lines,

        * the first words of the non-blank lines are exactly
          ``TYPE, FEPARAMS, DATA, *, TYPE, STATS, DATA, *, TYPE, FEATURES``
          in that order, and
        * the line starting with ``FEATURES`` contains both
          ``gProcessedSignal`` and ``rProcessedSignal``.

        Sub-directories and the entries ``.DS_Store``, ``._.DS_Store`` and
        ``.Rapp.history`` are skipped.  The ``outfile`` folder is created
        only when at least one file was accepted.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, or if
                ``filelib.exists_nz`` rejects the output folder after
                copying.
            ValueError: if no file was accepted.
        """
        import os
        import shutil
        from Betsy import module_utils
        from genomicode import filelib
        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'

        skipped_names = ['.DS_Store', '._.DS_Store', '.Rapp.history']
        expected_tags = [
            'TYPE', 'FEPARAMS', 'DATA', '*', 'TYPE', 'STATS',
            'DATA', '*', 'TYPE', 'FEATURES'
        ]
        signal_tag = set(['gProcessedSignal', 'rProcessedSignal'])

        agilent_files = []
        for filename in filenames:
            if filename in skipped_names:
                continue
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                continue
            postag = []
            fline = []
            # The context manager closes the handle even if reading raises.
            with open(path, 'r') as f:
                for _ in range(10):
                    words = f.readline().split()
                    if words:
                        postag.append(words[0])
                        if words[0] == 'FEATURES':
                            fline = set(words)
            if signal_tag.issubset(fline) and postag == expected_tags:
                agilent_files.append(filename)

        if not agilent_files:
            raise ValueError('There is no agilent file in the input.')

        if not os.path.exists(outfile):
            os.mkdir(outfile)
        for filename in agilent_files:
            old_file = os.path.join(directory, filename)
            new_file = os.path.join(outfile, filename)
            shutil.copyfile(old_file, new_file)
        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_agilent_files fails' % outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Extract the matrix file from the expression files.

        Every entry of the input directory whose name contains
        ``series_matrix.txt`` is passed to ``extract_expression_file``
        together with the target path ``<outfile>/<entry name>``.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, or if
                ``filelib.exists_nz`` rejects ``outfile`` afterwards.
        """
        import os
        from Betsy import module_utils
        from genomicode import filelib
        directory = module_utils.unzip_if_zip(antecedents.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'

        matrix_names = [
            name for name in filenames if 'series_matrix.txt' in name
        ]
        for name in matrix_names:
            source = os.path.join(directory, name)
            target = os.path.join(outfile, name)
            extract_expression_file(source, target)

        assert filelib.exists_nz(outfile), (
            'the output file %s for extract_matrix_file fails' % outfile)
Exemple #9
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Copy the files reported by ``guess_datatype`` into ``outfile``.

        The ``outfile`` folder is created when missing.  The input must
        already be a folder: the path returned by
        ``module_utils.unzip_if_zip`` has to equal the input identifier.
        Each file name returned by ``guess_datatype`` is copied into
        ``outfile`` under its base name.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if ``unzip_if_zip`` returns a path different
                from the input identifier, or the input holds no entries.
        """
        import os
        import shutil
        from Betsy import module_utils

        in_data = antecedents
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        in_path = module_utils.unzip_if_zip(in_data.identifier)
        assert in_path == in_data.identifier
        assert os.listdir(in_path), "The input folder or zip file is empty."

        # guess_datatype yields a (datatype, filenames) pair; only the file
        # names are needed here.
        _, matched_files = guess_datatype(in_path)
        for source in matched_files:
            target = os.path.join(outfile, os.path.basename(source))
            shutil.copyfile(source, target)
Exemple #10
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Copy the ``.idat`` files of the input into ``out_path``.

        Files are taken from ``filelib.list_files_in_path`` and selected by
        a case-insensitive ``.idat`` extension.  When the file name (without
        extension) ends in ``_Grn``, that suffix is dropped from the name of
        the copy.  ``out_path`` is created when it does not exist yet.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if no ``.idat`` file is found.
        """
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils

        path = module_utils.unzip_if_zip(in_data.identifier)
        in_filenames = []
        for candidate in filelib.list_files_in_path(path):
            if candidate.lower().endswith(".idat"):
                in_filenames.append(candidate)
        assert in_filenames, "No idat files."

        if not os.path.exists(out_path):
            os.mkdir(out_path)

        green_suffix = "_Grn"
        for in_filename in in_filenames:
            stem, ext = os.path.splitext(os.path.basename(in_filename))
            if stem.endswith(green_suffix):
                stem = stem[:-len(green_suffix)]
            shutil.copyfile(in_filename, os.path.join(out_path, stem + ext))
Exemple #11
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Run the duplicate-marking jar on every file of the input folder.

        For each entry of the input directory a ``java -Xmx5g -jar`` command
        is executed with the jar configured in ``config.Mark_duplicates``,
        ``I=<infile>``, ``O=<outfile>/<stem>.bam`` and
        ``REMOVE_DUPLICATES=true``.  The ``outfile`` folder is created when
        it does not exist yet.

        ``network``, ``out_attributes``, ``user_options`` and ``num_cores``
        are not used by this method.

        Raises:
            AssertionError: if the input holds no entries, the configured
                jar path does not exist, or an expected output fails
                ``module_utils.exists_nz``.
            ValueError: if the text ``error`` occurs in the stderr output of
                a run.
        """
        import os
        import subprocess
        from Betsy import module_utils
        from genomicode import config
        in_data = antecedents
        directory = module_utils.unzip_if_zip(in_data.identifier)
        filenames = os.listdir(directory)
        assert filenames, 'The input folder or zip file is empty.'
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        mark_duplicates_path = config.Mark_duplicates
        assert os.path.exists(
            mark_duplicates_path), 'cannot find the %s' % mark_duplicates_path
        for filename in filenames:
            infile = os.path.join(directory, filename)
            outname = os.path.splitext(filename)[0] + '.bam'
            outname = os.path.join(outfile, outname)
            command = [
                'java', '-Xmx5g', '-jar', mark_duplicates_path, 'I=' + infile,
                'O=' + outname, 'METRICS_FILE=metricsFile',
                'VALIDATION_STRINGENCY=LENIENT', 'REMOVE_DUPLICATES=true'
            ]
            process = subprocess.Popen(command,
                                       shell=False,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            # communicate() reads both pipes until the child exits.  Calling
            # wait() first could deadlock once the child fills a pipe buffer
            # that nobody is reading.
            error_message = process.communicate()
            if 'error' in error_message[1]:
                raise ValueError(error_message)
            assert module_utils.exists_nz(outname), (
                'the output file %s for flag_dups_in_bam_folder does not exist'
                % outname)