Beispiel #1
0
def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        For each taxon id, filters the fasta file "<taxonId>.1.fna" located in srcDir and passes the same
        file name in dstDir as the destination; only the sequence ids that are NOT contained
        in notAllowedSeqIdSet are handed over as allowed names.

        @param taxonIdSet: iterable of taxon ids; each one determines a file name "<taxonId>.1.fna"
        @param srcDir: directory containing the source fasta files
        @type srcDir: str
        @param dstDir: directory for the destination fasta files
        @type dstDir: str
        @param notAllowedSeqIdSet: container of sequence ids that must not be kept
    """
    for taxonId in taxonIdSet:
        fileName = str(taxonId) + '.1.fna'
        srcFilePath = os.path.join(srcDir, fileName)
        dstFilePath = os.path.join(dstDir, fileName)

        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)
        # iterate over the mapping directly: dict.iterkeys() exists only in Python 2,
        # whereas plain iteration yields the keys in both Python 2 and Python 3
        allowedNamesSet = set(
            seqId for seqId in seqIdDict if seqId not in notAllowedSeqIdSet)

        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
Beispiel #2
0
def filterOutSequencesBatch(taxonIdSet, srcDir, dstDir, notAllowedSeqIdSet):
    """
        Filters the fasta file "<taxonId>.1.fna" of every given taxon id: the file is read from srcDir and
        the path with the same file name in dstDir is passed as the destination. The names handed over
        as allowed are all sequence ids of the source file that are not in notAllowedSeqIdSet.

        @param taxonIdSet: iterable of taxon ids (used to build the file names "<taxonId>.1.fna")
        @param srcDir: directory with the source fasta files
        @type srcDir: str
        @param dstDir: directory for the destination fasta files
        @type dstDir: str
        @param notAllowedSeqIdSet: container of sequence ids to be removed
    """
    for taxonId in taxonIdSet:
        fastaFileName = str(taxonId) + '.1.fna'
        srcFilePath = os.path.join(srcDir, fastaFileName)
        dstFilePath = os.path.join(dstDir, fastaFileName)

        # keys of the returned mapping are treated as the sequence ids of the source file
        seqIdDict = fasta.getSequenceToBpDict(srcFilePath)

        # plain iteration over the mapping replaces dict.iterkeys(), which was removed in Python 3
        allowedNamesSet = set()
        for seqId in seqIdDict:
            if seqId not in notAllowedSeqIdSet:
                allowedNamesSet.add(seqId)

        fasta.filterOutSequences(srcFilePath, dstFilePath, allowedNamesSet)
def filterSequences(inFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna',
                    outFileName='/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna',
                    mapFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt',
                    labelRemove=103690):
    """
        To filter sequences with a specific label.

        All sequence names that are mapped to a label different from labelRemove are collected
        and passed, together with the input and output fasta file paths, to fas.filterOutSequences.
        Calling the function without arguments uses the original hard-coded paths and label.

        @param inFileName: input fasta file
        @type inFileName: str
        @param outFileName: output fasta file
        @type outFileName: str
        @param mapFileName: tab separated mapping file ('#' marks comments); column 1 is read as the key
            (label) and column 0 as the value (sequence name)
        @type mapFileName: str
        @param labelRemove: sequences with this label are not kept
        @type labelRemove: int (or anything convertible by int())
    """
    # converted once, instead of once per label inside the loop
    labelToRemove = int(labelRemove)

    # label -> sequence ids (key column 1, value column 0)
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')

    allowedNamesSet = set()
    for label in labelToIdsDict:
        if int(label) != labelToRemove:
            allowedNamesSet.update(labelToIdsDict[label])

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
Beispiel #4
0
def filterSequences(inFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna',
                    outFileName='/net/metagenomics/projects/PPSmg/data/V35/nostocRemoved/contigsMappedBlast1000NostocRm.fna',
                    mapFileName='/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt',
                    labelRemove=103690):
    """
        To filter sequences with a specific label.

        Builds the set of sequence names whose label differs from labelRemove and hands it,
        with the input and output fasta paths, to fas.filterOutSequences. The defaults reproduce
        the formerly hard-coded file paths and label, so a call without arguments is unchanged.

        @param inFileName: input fasta file
        @type inFileName: str
        @param outFileName: output fasta file
        @type outFileName: str
        @param mapFileName: mapping file read with sep='\\t' and comment='#'; column 1 is used as the key
            (label), column 0 as the value (sequence name)
        @type mapFileName: str
        @param labelRemove: label of the sequences that are left out
        @type labelRemove: int (or anything convertible by int())
    """
    # the label to remove is loop invariant, convert it only once
    removedLabel = int(labelRemove)

    # label -> sequence ids (the keys are compared with the label, the values are the names)
    labelToIdsDict = csv.getMapping(mapFileName, 1, 0, sep='\t', comment='#')

    allowedNamesSet = set()
    for label in labelToIdsDict:
        if int(label) == removedLabel:
            continue
        for seqName in labelToIdsDict[label]:
            allowedNamesSet.add(seqName)

    fas.filterOutSequences(inFileName, outFileName, allowedNamesSet)
def maskDb(action,
           inDir,
           outDir,
           rank,
           clades,
           taxonomyFilePath,
           verbose=False):
    """
        Main function (function interface), see module description.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: if True, progress messages are printed
        @type verbose: bool
        @raise AssertionError: if an input parameter is invalid or a taxon id is not contained in the taxonomy
    """
    # check input parameters
    # NOTE: the checks are assert statements, i.e. they are skipped when Python runs with "-O"
    assert action in ['cl', 'mr',
                      'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exists: " +
                                           dirPath)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str(
        "Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(
        clades, set
    ) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded."
    )

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i - 1]] = _RANKS[i]

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)
    # everything that uses the taxonomy runs inside try/finally, so that the taxonomy is closed
    # also when a check fails or an exception is raised while the data are being masked
    try:
        # leaf clades to mask
        if isinstance(clades, set):
            inCladesSet = set(map(int, clades))
        else:
            inCladesSet = set(map(int, csv.getColumnAsList(clades)))

        # clades in the reference
        refCladesSet = set()
        if action in ['cl', 'mr']:
            # get the list of all taxon ids that appear in the directory (as PPS reference)
            for fastaFilePath in glob.glob(
                    os.path.join(os.path.normpath(inDir),
                                 r'*.f[na][as]')):  # *.fas or *.fna
                refCladesSet.add(_refFilePathToTaxonId(
                    fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
        elif action in ['mg']:
            # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
            for mapFilePath in glob.glob(
                    os.path.join(os.path.normpath(inDir),
                                 r'*.tax')):  # *.tax
                refCladesSet.update(
                    set(
                        map(_mgSeqIdToTaxonId,
                            csv.getColumnAsList(mapFilePath, sep='\t'))))
        else:
            assert False, str('Not supported action: ' + action)

        # checks whether taxonIds are in the taxonomy
        for taxonId in inCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from clades list is not contained in the taxonomy!'
                % taxonId)
        for taxonId in refCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from the reference is not contained in the taxonomy!'
                % taxonId)

        # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
        for taxonId in inCladesSet:
            if not taxonomy.isLeaf(taxonId):
                print(
                    'Taxon id %s does not represent a leaf clade in the taxonomy.'
                    % taxonId)

        if verbose:
            print('Initial checks done.')

        # taxonIds that should be excluded
        toExcludeSet = set()
        for taxonId in inCladesSet:
            taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
            if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
                print('Taxon id: "%s" is not defined at rank: "%s"' %
                      (taxonId, rank))
                currentRank = rank  # find a lower rank at which it's defined
                while currentRank in toLowerRank:
                    currentRank = toLowerRank[currentRank]
                    taxonIdAtRank = taxonomy.getTaxonIdAtRank(
                        taxonId, currentRank)
                    if taxonIdAtRank is not None:
                        break
                if taxonIdAtRank is None:
                    # not defined at any lower rank either, mask the taxon id itself
                    taxonIdAtRank = taxonId
                    currentRank = _STRAIN
                print('Taxon id: %s will be masked at rank: %s' %
                      (taxonId, currentRank))

            # all child clades (and itself)
            toExcludeSet.add(int(taxonIdAtRank))
            toExcludeSet.update(
                set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

        # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
        toExcludeSet.intersection_update(refCladesSet)
        if verbose:
            print('Data to mask collected done.')

        print('To exclude: ', len(toExcludeSet))

        # exclude data from the reference
        if action == 'cl':
            # generates a list of taxonIds
            out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
            try:
                for taxonId in toExcludeSet:
                    out.writeText(str(taxonId) + '\n')
            finally:
                # close the output buffer also when writing fails
                out.close()
        elif action == 'mr':
            # masked reference sequences (create sim links to files that were not excluded)
            for fastaFilePath in glob.glob(
                    os.path.join(os.path.normpath(inDir),
                                 r'*.f[na][as]')):  # *.fas or *.fna
                taxonId = _refFilePathToTaxonId(
                    fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
                if taxonId not in toExcludeSet:
                    os.symlink(
                        fastaFilePath,
                        os.path.join(outDir, os.path.basename(fastaFilePath)))
        elif action == 'mg':
            # exclude sequences from the marker gene databases
            for mapFilePath in glob.glob(
                    os.path.join(os.path.normpath(inDir), r'*.tax')):

                # get entries that can stay in the mapping and fasta files
                allowedEntriesSet = set(
                    map(_mgSeqIdToTaxonId,
                        csv.getColumnAsList(mapFilePath, sep='\t')))
                allowedEntriesSet.difference_update(toExcludeSet)

                # filter out entries from the mapping file
                csv.filterOutLines(mapFilePath,
                                   os.path.join(
                                       outDir,
                                       os.path.basename(mapFilePath)),
                                   allowedEntriesSet,
                                   entryModifyFunction=_mgSeqIdToTaxonId,
                                   colNum=0,
                                   sep='\t')

                # filter out entries from the fasta file (same path as the ".tax" file, but with suffix ".fna")
                fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
                fas.filterOutSequences(
                    fastaFilePath,
                    os.path.join(outDir, os.path.basename(fastaFilePath)),
                    allowedEntriesSet,
                    seqNameModifyFunction=_mgSeqIdToTaxonId)
        else:
            assert False, 'Not supported action!'
    finally:
        taxonomy.close()

    if verbose:
        print('Data masked done.')
Beispiel #6
0
def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: if True, progress messages are printed
        @type verbose: bool
        @raise AssertionError: if an input parameter is invalid or a taxon id is not contained in the taxonomy
    """
    # check input parameters
    # NOTE: the checks are assert statements, i.e. they are skipped when Python runs with "-O"
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exists: " + dirPath)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i-1]] = _RANKS[i]

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)
    # the taxonomy is closed in the finally clause, i.e. also when a check fails or an exception is raised
    try:
        # leaf clades to mask
        if isinstance(clades, set):
            inCladesSet = set(map(int, clades))
        else:
            inCladesSet = set(map(int, csv.getColumnAsList(clades)))

        # clades in the reference
        refCladesSet = set()
        if action in ['cl', 'mr']:
            # get the list of all taxon ids that appear in the directory (as PPS reference)
            for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
                refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
        elif action in ['mg']:
            # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
            for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):  # *.tax
                refCladesSet.update(set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t'))))
        else:
            assert False, str('Not supported action: ' + action)

        # checks whether taxonIds are in the taxonomy
        for taxonId in inCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
        for taxonId in refCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

        # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
        for taxonId in inCladesSet:
            if not taxonomy.isLeaf(taxonId):
                print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

        if verbose:
            print('Initial checks done.')

        # taxonIds that should be excluded
        toExcludeSet = set()
        for taxonId in inCladesSet:
            taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
            if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
                print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
                currentRank = rank  # find a lower rank at which it's defined
                while currentRank in toLowerRank:
                    currentRank = toLowerRank[currentRank]
                    taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                    if taxonIdAtRank is not None:
                        break
                if taxonIdAtRank is None:
                    # not defined at any lower rank either, mask the taxon id itself
                    taxonIdAtRank = taxonId
                    currentRank = _STRAIN
                print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

            # all child clades (and itself)
            toExcludeSet.add(int(taxonIdAtRank))
            toExcludeSet.update(set(map(int, taxonomy.getAllChildren(taxonIdAtRank))))

        # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
        toExcludeSet.intersection_update(refCladesSet)
        if verbose:
            print('Data to mask collected done.')

        print('To exclude: ', len(toExcludeSet))

        # exclude data from the reference
        if action == 'cl':
            # generates a list of taxonIds
            out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
            try:
                for taxonId in toExcludeSet:
                    out.writeText(str(taxonId) + '\n')
            finally:
                # close the output buffer also when writing fails
                out.close()
        elif action == 'mr':
            # masked reference sequences (create sim links to files that were not excluded)
            for fastaFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.f[na][as]')):  # *.fas or *.fna
                taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
                if taxonId not in toExcludeSet:
                    os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))
        elif action == 'mg':
            # exclude sequences from the marker gene databases
            for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.tax')):

                # get entries that can stay in the mapping and fasta files
                allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
                allowedEntriesSet.difference_update(toExcludeSet)

                # filter out entries from the mapping file
                csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                                   allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

                # filter out entries from the fasta file (same path as the ".tax" file, but with suffix ".fna")
                fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
                fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                       allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
        else:
            assert False, 'Not supported action!'
    finally:
        taxonomy.close()

    if verbose:
        print('Data masked done.')