コード例 #1
0
ファイル: _format.py プロジェクト: qiyunzhu/q2-types
class CasavaOneEightLanelessPerSampleDirFmt(model.DirectoryFormat):
    """Directory of gzipped FASTQ files whose names carry no lane field.

    Member files match ``<sample>_<barcode>_R<1|2>_001.fastq.gz``.
    """

    sequences = model.FileCollection(
        r'.+_.+_R[12]_001\.fastq\.gz', format=FastqGzFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, read_number):
        """Return the file name for one sample/barcode/read combination.

        ``read_number`` is rendered through ``%d`` and therefore has to be
        numeric.
        """
        name_fields = (sample_id, barcode_id, read_number)
        return '%s_%s_R%d_001.fastq.gz' % name_fields
コード例 #2
0
class JSONDirectory(model.DirectoryFormat):
    """Directory holding a ``manifest.csv`` next to a set of ``*.json`` files."""

    manifest = model.File("manifest.csv", format=ModelManifest)
    json_files = model.FileCollection(r".+\.json", format=JSONFormat)

    # NOTE(review): the method name says "sbml" although it is the path maker
    # for ``json_files``. The name is left untouched because code outside this
    # class may refer to the attribute by name.
    @json_files.set_path_maker
    def sbml_path_maker(self, model_id):
        """Return the file name under which the model ``model_id`` is stored."""
        filename_template = "%s.json"
        return filename_template % model_id
コード例 #3
0
class SBMLDirectory(model.DirectoryFormat):
    """Directory holding a ``manifest.csv`` next to a set of ``*.xml`` files."""

    manifest = model.File("manifest.csv", format=ModelManifest)
    sbml_files = model.FileCollection(r".+\.xml", format=SBMLFormat)

    @sbml_files.set_path_maker
    def sbml_path_maker(self, model_id):
        """Return the file name under which the model ``model_id`` is stored."""
        filename_template = "%s.xml"
        return filename_template % model_id
コード例 #4
0
class CommunityModelDirectory(model.DirectoryFormat):
    """Directory holding a ``manifest.csv`` plus a set of ``*.pickle`` files."""

    manifest = model.File("manifest.csv", format=CommunityModelManifest)
    model_files = model.FileCollection(
        r".+\.pickle", format=CommunityModelFormat)

    @model_files.set_path_maker
    def model_path_maker(self, model_id):
        """Return the file name under which the model ``model_id`` is stored."""
        filename_template = "%s.pickle"
        return filename_template % model_id
コード例 #5
0
ファイル: _format.py プロジェクト: paulinetrinh/q2-types
class CasavaOneEightSingleLanePerSampleDirFmt(model.DirectoryFormat):
    """Flat directory of gzipped FASTQ files whose names carry a lane number.

    Member files match
    ``<sample>_<barcode>_L<3 digits>_R<1|2>_001.fastq.gz``. Two class flags
    steer ``_validate_``:

    * ``_CHECK_PAIRED`` -- when false, finding both forward and reverse reads
      is an error; when true, both directions must list the same sample ids.
    * ``_REQUIRE_PAIRED`` -- when true, validation fails unless both forward
      and reverse reads are present.
    """

    _CHECK_PAIRED = True
    _REQUIRE_PAIRED = False

    sequences = model.FileCollection(
        r'.+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz',
        format=FastqGzFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, lane_number,
                             read_number):
        """Return the file name for one sample/barcode/lane/read combination.

        ``lane_number`` is zero-padded to three digits; both ``lane_number``
        and ``read_number`` go through ``%d`` and so must be numeric.
        """
        name_fields = (sample_id, barcode_id, lane_number, read_number)
        return '%s_%s_L%03d_R%d_001.fastq.gz' % name_fields

    def _find_duplicates(self, ids):
        """Return the set of values that occur more than once in ``ids``."""
        tally = collections.Counter(ids)
        return {value for value in tally if tally[value] > 1}

    def _validate_(self, level):
        """Check directory layout, sample-id uniqueness and read pairing.

        Args:
            level: not consulted by this implementation.

        Raises:
            ValidationError: if the directory contains a subdirectory, if a
                sample id appears twice among the forward or among the
                reverse reads, or if the pairing rules given by
                ``_CHECK_PAIRED`` / ``_REQUIRE_PAIRED`` are violated.
        """
        forward_ids = []
        reverse_ids = []
        for entry in self.path.iterdir():
            if entry.is_dir():
                # A path such as Human_Kneecap/S1_L001_R1_001.fastq.gz still
                # satisfies the file-name pattern. Rejecting directories
                # outright is simpler than tightening the regex and yields a
                # clearer error message.
                raise ValidationError("Contains a subdirectory: %s"
                                      % entry.relative_to(self.path))

            filename = entry.name
            if not filename.endswith('_001.fastq.gz'):
                continue

            # The sample id is whatever precedes the last four
            # underscore-separated fields of the file name.
            sample_id = filename.rsplit('_', maxsplit=4)[0]
            if filename.endswith('R1_001.fastq.gz'):
                forward_ids.append(sample_id)
            else:
                reverse_ids.append(sample_id)

        duplicate_checks = (
            (forward_ids, 'Duplicate samples in forward reads: %r'),
            (reverse_ids, 'Duplicate samples in reverse reads: %r'),
        )
        for ids, message in duplicate_checks:
            if len(set(ids)) != len(ids):
                raise ValidationError(message % self._find_duplicates(ids))

        if not (forward_ids and reverse_ids):
            # At most one read direction is present.
            if self._REQUIRE_PAIRED:
                raise ValidationError("Reads are not paired end.")
            return

        if not self._CHECK_PAIRED:
            raise ValidationError("Forward and reverse reads found.")

        unpaired = set(forward_ids) ^ set(reverse_ids)
        if unpaired:
            raise ValidationError(
                "These samples do not have matching pairs of forward and "
                "reverse reads: %r" % unpaired)
コード例 #6
0
class FourIntsDirectoryFormat(model.DirectoryFormat):
    """
    Four integers kept as an ordered sequence, one integer per file.

    Files 1 and 2 sit at the top level while files 3 and 4 sit inside
    ``nested/``. The position of a value in the sequence follows its file
    name, and the same value may occur more than once.

    """
    single_ints = model.FileCollection(
        r'file[1-2]\.txt|nested/file[3-4]\.txt',
        format=SingleIntFormat)

    @single_ints.set_path_maker
    def single_ints_path_maker(self, num):
        """Return the relative path of the file holding integer ``num``.

        Raises:
            ValueError: if ``num`` is not strictly between 0 and 5.
        """
        if not 0 < num < 5:
            raise ValueError("`num` must be 1-4, not %r." % num)
        # Only the first two files are stored at the top level.
        template = 'nested/file%d.txt' if num > 2 else 'file%d.txt'
        return template % num
コード例 #7
0
ファイル: _format.py プロジェクト: mikerobeson/q2-sourmash
class MinHashSigJsonDirFormat(model.DirectoryFormat):
    """Directory whose member files all end in ``.sig``."""

    signatures = model.FileCollection(r'.*\.sig', format=MinHashSigJson)

    @signatures.set_path_maker
    def signature_path_maker(self, name):
        """Return ``name`` with the ``.sig`` extension appended."""
        extension = '.sig'
        return name + extension
コード例 #8
0
ファイル: _format.py プロジェクト: qiyunzhu/q2-types
class CasavaOneEightSingleLanePerSampleDirFmt(model.DirectoryFormat):
    """Flat directory of gzipped FASTQ files whose names carry a lane number.

    Member files match
    ``<sample>_<barcode>_L<3 digits>_R<1|2>_001.fastq.gz``. Two class flags
    steer ``_validate_``:

    * ``_CHECK_PAIRED`` -- when false, finding both forward and reverse reads
      is an error; when true, both directions must list the same sample ids.
    * ``_REQUIRE_PAIRED`` -- when true, validation fails unless both forward
      and reverse reads are present.
    """

    _CHECK_PAIRED = True
    _REQUIRE_PAIRED = False

    sequences = model.FileCollection(
        r'.+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz',
        format=FastqGzFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, lane_number,
                             read_number):
        """Return the file name for one sample/barcode/lane/read combination.

        ``lane_number`` is zero-padded to three digits; both ``lane_number``
        and ``read_number`` go through ``%d`` and so must be numeric.
        """
        name_fields = (sample_id, barcode_id, lane_number, read_number)
        return '%s_%s_L%03d_R%d_001.fastq.gz' % name_fields

    def _find_duplicates(self, ids):
        """Return the set of values that occur more than once in ``ids``."""
        tally = collections.Counter(ids)
        return {value for value in tally if tally[value] > 1}

    @property
    def manifest(self):
        """Build a per-sample table of the sequence files in this directory.

        A temporary ``FastqManifestFormat`` is filled with one
        ``sample-id,filename,direction`` row per member of ``sequences`` and
        handed to ``_manifest_to_df``. The resulting table is guaranteed to
        have both a ``forward`` and a ``reverse`` column, and every entry in
        them that is not ``None`` is rewritten to a path inside ``self.path``.
        """
        scratch = FastqManifestFormat()
        with scratch.open() as handle:
            handle.write('sample-id,filename,direction\n')
            for relpath, _ in self.sequences.iter_views(FastqGzFormat):
                sample_id, _, _, _, direction = _parse_casava_filename(relpath)
                row = (sample_id, relpath.name, direction)
                handle.write('%s,%s,%s\n' % row)

        table = _manifest_to_df(scratch, self.path.parent)

        # Add whichever direction is missing; 'reverse' is handled first so
        # the order of newly created columns stays the same as before.
        for column in ('reverse', 'forward'):
            if column not in table:
                table[column] = None

        def rebase_into_directory(value):
            # Keep only the base name and anchor it at this directory.
            if value is None:
                return value
            return str(self.path / pathlib.Path(value).name)

        for column in ('forward', 'reverse'):
            table[column] = table[column].apply(rebase_into_directory)

        return table

    def _validate_(self, level):
        """Check directory layout, sample-id uniqueness and read pairing.

        Args:
            level: not consulted by this implementation.

        Raises:
            ValidationError: if the directory contains a subdirectory, if a
                sample id appears twice among the forward or among the
                reverse reads, or if the pairing rules given by
                ``_CHECK_PAIRED`` / ``_REQUIRE_PAIRED`` are violated.
        """
        forward_ids = []
        reverse_ids = []
        for entry in self.path.iterdir():
            if entry.is_dir():
                # A path such as Human_Kneecap/S1_L001_R1_001.fastq.gz still
                # satisfies the file-name pattern. Rejecting directories
                # outright is simpler than tightening the regex and yields a
                # clearer error message.
                raise ValidationError("Contains a subdirectory: %s"
                                      % entry.relative_to(self.path))

            filename = entry.name
            if not filename.endswith('_001.fastq.gz'):
                continue

            # The sample id is whatever precedes the last four
            # underscore-separated fields of the file name.
            sample_id = filename.rsplit('_', maxsplit=4)[0]
            if filename.endswith('R1_001.fastq.gz'):
                forward_ids.append(sample_id)
            else:
                reverse_ids.append(sample_id)

        duplicate_checks = (
            (forward_ids, 'Duplicate samples in forward reads: %r'),
            (reverse_ids, 'Duplicate samples in reverse reads: %r'),
        )
        for ids, message in duplicate_checks:
            if len(set(ids)) != len(ids):
                raise ValidationError(message % self._find_duplicates(ids))

        if not (forward_ids and reverse_ids):
            # At most one read direction is present.
            if self._REQUIRE_PAIRED:
                raise ValidationError("Reads are not paired end.")
            return

        if not self._CHECK_PAIRED:
            raise ValidationError("Forward and reverse reads found.")

        unpaired = set(forward_ids) ^ set(reverse_ids)
        if unpaired:
            raise ValidationError(
                "These samples do not have matching pairs of forward and "
                "reverse reads: %r" % unpaired)
コード例 #9
0
class FASTAFilesDirFmt(model.DirectoryFormat):
    """Directory whose member files all end in ``.fasta``."""

    fastas = model.FileCollection(r'.+\.fasta', format=DNAFASTAFormat)

    @fastas.set_path_maker
    def fastas_path_maker(self, name):
        """Return ``name`` with the ``.fasta`` extension appended."""
        extension = '.fasta'
        return name + extension
コード例 #10
0
class SAMFilesDirFmt(model.DirectoryFormat):
    """Directory whose member files all end in ``.sam``."""

    sams = model.FileCollection(r'.+\.sam', format=SAMFormat)

    @sams.set_path_maker
    def sams_path_maker(self, name):
        """Return ``name`` with the ``.sam`` extension appended."""
        extension = '.sam'
        return name + extension
コード例 #11
0
class BAMFilesDirFmt(model.DirectoryFormat):
    """Directory whose member files all end in ``.bam``."""

    bams = model.FileCollection(r'.+\.bam', format=BAMFormat)

    @bams.set_path_maker
    def bams_path_maker(self, name):
        """Return ``name`` with the ``.bam`` extension appended."""
        extension = '.bam'
        return name + extension
コード例 #12
0
class PileUpFilesDirFmt(model.DirectoryFormat):
    """Directory whose member files all end in ``.tsv``."""

    pileups = model.FileCollection(r'.+\.tsv', format=PileUpTSVFormat)

    @pileups.set_path_maker
    def pileups_path_maker(self, name):
        """Return ``name`` with the ``.tsv`` extension appended."""
        extension = '.tsv'
        return name + extension