Code Example #1
File: ensembl.py  Project: ulelab/iCount-Mini
def species(release=MAX_RELEASE_SUPPORTED):
    """
    Get list of available species for given ENSEMBL release.

    Parameters
    ----------
    release : int
        Release number. Only ENSEMBL releases {0}-{1} are available.

    Returns
    -------
    list
        List of species.

    """
    if release is None:
        release = MAX_RELEASE_SUPPORTED
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not MIN_RELEASE_SUPPORTED <= release <= MAX_RELEASE_SUPPORTED:
        raise ValueError(
            'Release should be a number between {} and {}.'.format(
                MIN_RELEASE_SUPPORTED, MAX_RELEASE_SUPPORTED))

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    ftp.cwd('pub/' + 'release-' + str(release) + '/fasta/')
    spec_list = sorted(ftp.nlst())
    ftp.quit()
    LOGGER.info('There are %d species available: %s', len(spec_list),
                ','.join(map(str, spec_list)))
    return spec_list
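
A brief usage sketch of the function above. This is hypothetical: it assumes the iCount package is installed, that the chosen release number falls within the supported range, and that the ENSEMBL FTP server is reachable.

import iCount.genomes.ensembl as ensembl

# List species for the newest supported release (the default), then for a
# hypothetical older release number.
latest = ensembl.species()
older = ensembl.species(release=88)
print(len(latest), latest[:3])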
Code Example #2
File: clusters.py  Project: nebo56ucl/iCount
def run(sites, clusters, dist=20):  # , extend=0):
    """
    Join neighboring cross-linked sites into clusters.

    The score of a cluster is the sum of all its elements' scores.

    Parameters
    ----------
    sites : str
        Path to input BED6 file with sites.
    clusters : str
        Path to output BED6 file with merged sites.
    dist : int
        Maximum distance between two cross-links for them to still be merged.

    Returns
    -------
    str
        BED file with clusters as elements.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    # It is required to pre-sort your data:
    sites = pybedtools.BedTool(sites).sort().saveas()

    LOGGER.info('Merging cross-links from file %s', sites)
    merged = sites.merge(s=True, d=dist, c=[5, 4], o='sum,distinct').saveas()
    out = merged.sort().each(_fix_proper_bed6_format).saveas(clusters)

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(out.fn))
    return out.fn
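
To make the merge step above concrete, here is a minimal stand-alone pybedtools sketch (not part of iCount) showing how merge(s=True, d=..., c=[5, 4], o='sum,distinct') sums the scores of nearby same-strand sites, which is how a cluster's score ends up being the sum of its elements' scores.

import pybedtools

# Two cross-link sites 10 bp apart on the same strand
# (BED6 columns: chrom, start, end, name, score, strand).
sites = pybedtools.BedTool(
    'chr1\t100\t101\tx\t3\t+\n'
    'chr1\t110\t111\tx\t5\t+\n',
    from_string=True,
)
merged = sites.sort().merge(s=True, d=20, c=[5, 4], o='sum,distinct')
# One merged interval spanning 100-111 with the summed score 8; the exact
# column layout of the output depends on the installed bedtools version.
print(merged)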
Code Example #3
def chrom_length(fasta_in):
    """
    Compute chromosome lengths and store them in a file, using samtools faidx.

    More about the .fai file format can be found here:
    http://www.htslib.org/doc/faidx.html

    Parameters
    ----------
    fasta_in : str
        Path to genome FASTA file (can be .gz).

    Returns
    -------
    str
        Absolute path to output file.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    temp = iCount.files.decompress_to_tempfile(fasta_in)
    command = ['samtools', 'faidx', temp]
    subprocess.check_call(command)

    # The command above created a .fai file. Move & rename this file to fasta_in.fai:
    fai_file_temp = temp + '.fai'
    fai_file = fasta_in + '.fai'
    subprocess.check_call(['mv', fai_file_temp, fai_file])
    LOGGER.info('Fai file saved to: %s', os.path.abspath(fai_file))
    return os.path.abspath(fai_file)
Code Example #4
File: ensembl.py  Project: ulelab/iCount-Mini
def chrom_length(fasta_in):
    """
    Compute chromosome lengths of fasta file and store them into a file.

    More about the .fai file format can be found here:
    http://www.htslib.org/doc/faidx.html

    Parameters
    ----------
    fasta_in : str
        Path to genome FASTA file (can be .gz).

    Returns
    -------
    str
        Absolute path to output file.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    temp = iCount.files.decompress_to_tempfile(fasta_in)
    pysam.faidx(temp)  # pylint: disable=no-member

    fai_file = os.path.abspath(fasta_in + '.fai')
    shutil.move(temp + '.fai', fai_file)
    LOGGER.info('Fai file saved to: %s', fai_file)
    return fai_file
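
Both chrom_length variants produce a .fai index, a plain tab-separated file whose first two columns are the sequence name and its length (see the htslib link above). A small, iCount-independent sketch for reading those lengths back:

def read_chrom_lengths(fai_path):
    """Return a {chromosome_name: length} dict parsed from a .fai file."""
    lengths = {}
    with open(fai_path) as handle:
        for line in handle:
            # .fai columns: NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH
            fields = line.strip().split('\t')
            lengths[fields[0]] = int(fields[1])
    return lengths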
Code Example #5
def merge_bed(sites_grouped, sites):
    """
    Merge multiple files with crosslinks into one.

    Concatenate files into one file. Also, merge crosslinks from different files
    that are on same position and sum their scores.

    Parameters
    ----------
    sites_grouped : str
        Path to output BED6 file containing merged data from input sites files.
    sites : list_str
        List of BED6 files (paths) to be merged.

    Returns
    -------
    str
        Absolute path to outfile.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not sites:
        raise ValueError(
            "At least one element expected in files list, but none found.")

    LOGGER.info('Reading input files...')
    joined = tempfile.NamedTemporaryFile(mode='at', delete=False)
    for file_path in sites:
        if not os.path.isfile(file_path):
            raise ValueError("File {} not found.".format(file_path))
        with iCount.files.gz_open(file_path, 'rt') as infile:
            shutil.copyfileobj(infile, joined)
    joined.close()

    # Merge intervals in "joined" file (needs to be sorted beforehand!):
    # s=True - only merge features that are on the same strand
    # d=-1 - join only intervals with at least one base pair of overlap; the
    # default (0) also merges touching intervals
    # c=5, o='sum' - when merging intervals, make operation 'sum' on column 5 (score)
    LOGGER.info('Merging files...')
    merged = pybedtools.BedTool(joined.name).sort().merge(
        s=True, d=-1, c=(5, 6), o=('sum', 'distinct')).sort().saveas()

    # Columns are now shuffled to: chrom-start-stop-strand-score
    # Reorder to: chrom-start-stop-empty_name-score-strand
    # which corresponds to BED6

    LOGGER.info('Saving results...')
    result = pybedtools.BedTool(
        pybedtools.create_interval_from_list(i[:3] + ['.'] + i[3:])
        for i in merged).saveas()
    result.saveas(sites_grouped)

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(result.fn))
    return os.path.abspath(result.fn)
Code Example #6
File: clusters.py  Project: matteofloris/iCount
def run(sites, peaks, clusters, dist=20, slop=3):
    """
    Join neighboring peaks (at distance dist) into clusters.

    Report sum of sites' scores within each cluster, including slop.

    Parameters
    ----------
    sites : str
        Path to input BED6 file with sites.
    peaks : str
        Path to input BED6 file with peaks (or clusters).
    clusters : str
        Path to output BED6 file with merged peaks (clusters).
    dist : int
        Distance between two peaks to merge into same cluster.
    slop : int
        Distance between site and cluster to assign site to cluster.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if slop >= dist:
        LOGGER.warning('Distance between peaks (%s) should be larger than cluster slop ('
                       '%s)', dist, slop)

    # It is required to pre-sort your data:
    LOGGER.info('Reading individual sites from %s', sites)
    bt_sites = pybedtools.BedTool(sites).sort().saveas()
    LOGGER.info('Reading peaks from %s', peaks)
    bt_peaks = pybedtools.BedTool(peaks).sort().saveas()

    LOGGER.info('Merging peaks to form clusters')
    merged = bt_peaks.merge(s=True, d=dist, c=[4], o='distinct').saveas()
    bt_merged = merged.each(_fix_bed6_zeroscore).sort().saveas()

    LOGGER.info('Summing sites within identified clusters')
    # for each site, find the closest cluster to assign the site to
    bt_selected_sites = bt_sites.closest(bt_merged, s=True, d=True, t='first', stream=True).\
                                 filter(lambda b: 0 <= int(b.fields[-1]) <= slop).saveas()
    bt_selected_sites = bt_selected_sites.each(_select_bed6_noname).sort().saveas()

    # merge selected sites and previously identified clusters
    merged = bt_selected_sites.cat(bt_merged, postmerge=True,
                                   s=True, d=slop, c=[5, 4], o='sum,distinct').saveas()
    out = merged.each(_fix_bed6).sort().saveas(clusters)

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(out.fn))
    return metrics
Code Example #7
def bed2bedgraph(bed,
                 bedgraph,
                 name='User Track',
                 description='User Supplied Track'):
    """
    Convert from BED6 to bedGraph format.

    For further explanation of the ``name`` and ``description`` parameters
    (and others that are not yet supported) see:
    https://genome.ucsc.edu/goldenPath/help/customTrack.html#TRACK

    Parameters
    ----------
    bed : str
        Input BED6 file.
    bedgraph : str
        Output bedGraph file.
    name : str
        Track label. Can consist of up to 15 characters.
    description : str
        Track description. Can consist of up to 60 characters.

    Returns
    -------
    None

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)  # pylint: disable=protected-access
    LOGGER.info('Checking input parameters...')
    assert bed.endswith(('.bed', '.bed.gz'))

    header = [
        'track',
        'type=bedGraph',
        'name="{}"'.format(name[:15]),
        'description="{}"'.format(description[:60]),
    ]

    LOGGER.info('Converting %s to %s', bed, bedgraph)
    with open(bedgraph, 'wt') as outfile:
        outfile.write(' '.join(map(str, header)) + '\n')
        for line in pybedtools.BedTool(bed):
            bg_line = [
                line.chrom, line.start, line.stop,
                '{}{}'.format(line.strand, line.score)
            ]
            outfile.write('\t'.join(map(str, bg_line)) + '\n')
    LOGGER.info('Done.')
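
Note how the loop encodes the strand in the sign prefix of the bedGraph value: a '+' strand score is written as '+5' and a '-' strand score as '-5'. A tiny illustration of that formatting step in isolation, with hypothetical values:

# Hypothetical BED6 record fields, formatted the same way as in the loop above.
chrom, start, stop, strand, score = 'chr1', 100, 101, '-', '5'
bg_line = [chrom, start, stop, '{}{}'.format(strand, score)]
print('\t'.join(map(str, bg_line)))  # -> chr1  100  101  -5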
Code Example #8
def species():
    """
    Get list of available species.

    Returns
    -------
    list
        List of species.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    ftp.cwd('pub/gencode')
    spec_list = [
        item[8:] for item in ftp.nlst() if re.match(r'Gencode_\w+', item)
    ]
    ftp.quit()
    LOGGER.info('There are %d species available: %s', len(spec_list),
                ','.join(map(str, spec_list)))
    return spec_list
Code Example #9
File: xlsites.py  Project: ulelab/iCount-Mini
def run(bam, sites_single, sites_multi, skipped, group_by='start', quant='cDNA',
        segmentation=None, mapq_th=0, multimax=50, gap_th=4,
        report_progress=False):
    """
    Identify and quantify cross-linked sites.

    Interpret mapped sites and generate BED file with coordinates and
    number of cross-linked events.

    MAPQ is calculated as mapq = int(-10 * log10(1 - 1/Nmap)). By default,
    mapq_th is set to 0 to include all reads. The MAPQ score is informative
    because the values reported by STAR come from a very limited set:
    0 (5 or more multiple hits), 1 (3 or 4 multiple hits), 3 (2 multiple hits),
    255 (single hit).

    Parameters
    ----------
    bam : str
        Input BAM file with mapped reads.
    sites_single : str
        Output BED6 file to store data from single mapped reads.
    sites_multi : str
        Output BED6 file to store data from single and multi-mapped reads.
    skipped : str
        Output BAM file to store reads that do not map as expected by segmentation and
        reference genome sequence. If a read's second start does not fall on any of the
        segmentation borders, it is considered problematic. If segmentation is not provided,
        every read in two parts with a gap longer than gap_th is not used (skipped).
        All such reads are reported to the user for further exploration.
    group_by : str
        Assign score of a read to either 'start', 'middle' or 'end' nucleotide.
    quant : str
        Report number of 'cDNA' or number of 'reads'.
    segmentation : str
        File with custom segmentation format (obtained by ``iCount segment``).
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    multimax : int
        Ignore reads, mapped to more than ``multimax`` places.
    report_progress : bool
        Switch to report progress.
    gap_th : int
        Reads with gaps less than gap_th are treated as if they have no gap.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)  # pylint: disable=protected-access

    assert sites_single.endswith(('.bed', '.bed.gz'))
    assert sites_multi.endswith(('.bed', '.bed.gz'))
    assert skipped.endswith('.bam')
    assert quant in ['cDNA', 'reads']
    assert group_by in ['start', 'middle', 'end']

    metrics = iCount.Metrics()

    single, multi = {}, {}
    progress = 0
    for (chrom, strand), new_progress, by_pos in _processs_bam_file(
            bam, metrics, mapq_th, skipped, segmentation, gap_th):
        if report_progress:
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        single_by_pos = {}
        multi_by_pos = {}
        for xlink_pos, by_bc in by_pos.items():

            # count single mapped reads only
            _update(single_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=1))
            # count all reads mapped less than multimax times
            _update(multi_by_pos, _collapse(xlink_pos, by_bc, group_by, multimax=multimax))

        single.setdefault((chrom, strand), {}).update(single_by_pos)
        multi.setdefault((chrom, strand), {}).update(multi_by_pos)

    # Write output
    val_index = ['cDNA', 'reads'].index(quant)
    _save_dict(single, sites_single, val_index=val_index)
    LOGGER.info('Saved to BED file (single mapped reads): %s', sites_single)
    _save_dict(multi, sites_multi, val_index=val_index)
    LOGGER.info('Saved to BED file (multi-mapped reads): %s', sites_multi)

    return metrics
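
A quick numerical check of the MAPQ formula quoted in the docstring, mapq = int(-10 * log10(1 - 1/Nmap)). For small numbers of hits it reproduces the STAR values listed above; uniquely mapped reads are reported as 255 by convention, since the formula is undefined at Nmap = 1.

import math

for nmap in range(2, 6):
    mapq = int(-10 * math.log10(1 - 1 / nmap))
    print(nmap, mapq)
# 2 hits -> 3, 3 hits -> 1, 4 hits -> 1, 5 hits -> 0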
Code Example #10
def bed2bedgraph(bed,
                 bedgraph,
                 name='User Track',
                 description='User Supplied Track',
                 visibility=None,
                 priority=None,
                 color=None,
                 alt_color=None,
                 max_height_pixels=None):
    """
    Convert from BED6 to bedGraph format.

    For further explanation of parameters see:
    https://genome.ucsc.edu/goldenPath/help/customTrack.html#TRACK
    https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html

    Parameters
    ----------
    bed : str
        Input BED6 file.
    bedgraph : str
        Output bedGraph file.
    name : str
        Track label. Should be shorter than 15 characters.
    description : str
        Track description. Should be shorter than 60 characters.
    visibility: str
        Define the initial display mode of the annotation track. Choose
        among "hide", "dense", "full", "pack" and "squish". Default is
        "dense".
    priority : int
        Defines the track's order relative to other tracks in same group.
    color : str
        Define the main color for the annotation track. The track color
        consists of three comma-separated RGB values from 0-255, e.g.
        RRR,GGG,BBB. The default value is 0,0,0 (black).
    alt_color : str
        Allow a color range that varies from color to alt_color.
    max_height_pixels : str
        The limits of the vertical viewing space for the track, though it is
        configurable by the user. Should be of the format <max:default:min>.

    Returns
    -------
    None

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)  # pylint: disable=protected-access
    LOGGER.info('Checking input parameters...')
    assert bed.endswith(('.bed', '.bed.gz'))
    assert bedgraph.endswith('.bedgraph')

    header = [
        'track',
        'type=bedGraph',
        'name="{}"'.format(name),
        'description="{}"'.format(description),
    ]
    if visibility:
        assert re.fullmatch(r'hide|dense|full|pack|squish', visibility)
        header.append('visibility={}'.format(visibility))
    if priority:
        assert re.fullmatch(r'\d+', str(priority))
        header.append('priority={}'.format(priority))
    if color:
        assert re.fullmatch(r'\d{1,3},\d{1,3},\d{1,3}', color)
        header.append('color={}'.format(color))
    if alt_color:
        assert re.fullmatch(r'\d{1,3},\d{1,3},\d{1,3}', alt_color)
        header.append('altColor={}'.format(alt_color))
    if max_height_pixels:
        assert re.fullmatch(r'\d{1,3}:\d{1,3}:\d{1,3}', max_height_pixels)
        header.append('maxHeightPixels={}'.format(max_height_pixels))

    LOGGER.info('Converting %s to %s', bed, bedgraph)
    with open(bedgraph, 'wt') as outfile:
        outfile.write(' '.join(map(str, header)) + '\n')
        for line in pybedtools.BedTool(bed):
            bg_line = [
                line.chrom, line.start, line.stop,
                '{}{}'.format(line.strand, line.score)
            ]
            outfile.write('\t'.join(map(str, bg_line)) + '\n')
    LOGGER.info('Done.')
Code Example #11
def annotation(species, release, out_dir=None, annotation=None):
    """
    Download GENCODE annotation for given release/species.

    Note: This will download the "primary assembly" type of annotation.

    Parameters
    ----------
    species : str
        Species name.
    release : str
        Release number.
    out_dir : str
        Download to this directory (if not given, current working directory).
    annotation : str
        Annotation filename (must have .gz file extension). If not given
        original filename will be used. If annotation is provided as absolute
        path, value of out_dir parameter is ignored and file is saved to given
        absolute path.

    Returns
    -------
    str
        Downloaded annotation filename.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if species not in iCount.genomes.gencode.species():
        raise ValueError('Invalid species name.')

    if str(release) not in iCount.genomes.gencode.releases(species):
        raise ValueError('Invalid release number.')

    if annotation:
        assert annotation.endswith(('.gtf', '.gtf.gz'))

    # If absolute path is given, ignore out_dir:
    if annotation is not None and os.path.isabs(annotation):
        if out_dir:
            LOGGER.info('out_dir parameter has been changed, since absolute '
                        'path is provided by annotation parameter.')
        out_dir = os.path.dirname(annotation)
        annotation = os.path.basename(annotation)
    if not out_dir:
        out_dir = os.getcwd()
    if not os.path.isdir(out_dir):
        raise ValueError('Directory "{}" does not exist'.format(out_dir))

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    ftp.cwd('/pub/gencode/Gencode_{}/release_{}/'.format(species, release))
    server_files = ftp.nlst()

    # Get the desired annotation file from the list of all server files:
    regex = r'gencode\.v{}\.primary_assembly\.annotation\.gtf\.gz'.format(
        release)
    annotation_file = next(
        (fname for fname in server_files if re.match(regex, fname)), None)

    if not annotation_file:
        LOGGER.info('No GTF file found for species %s, release %s', species,
                    str(release))
        return None

    target_path = os.path.join(out_dir, annotation or annotation_file)
    LOGGER.info('Downloading GTF to: %s', target_path)
    with open(target_path, 'wb') as fhandle:
        ftp.retrbinary('RETR ' + annotation_file, fhandle.write)
    ftp.quit()

    LOGGER.info('Done.')
    return os.path.abspath(target_path)
Code Example #12
def run(reads,
        adapter,
        barcodes,
        mismatches=1,
        minimum_length=15,
        prefix='demux',
        out_dir='.'):
    """
    Demultiplex FASTQ file.

    Split input FASTQ file into separate files, one for each barcode, and
    additional file for non-matching barcodes.

    Parameters
    ----------
    reads : str
        Path to reads from a sequencing library.
    adapter : str
        Adapter sequence to remove from ends of reads.
    barcodes : list_str
        List of barcodes used for library.
    mismatches : int
        Number of tolerated mismatches when comparing barcodes.
    minimum_length : int
        Minimum length of trimmed sequence to keep.
    prefix : str
        Prefix of generated FASTQ files.
    out_dir : str
        Output folder. Use local folder if none given.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.reads_ok = 0
    metrics.reads_fail = 0

    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    out_fnames = [
        os.path.abspath(
            os.path.join(out_dir, '{}_{}_tmp.fastq.gz'.format(prefix,
                                                              barcode)))
        for barcode in barcodes + ['nomatch']
    ]

    LOGGER.info('Demultiplexing...')
    # Make list of file handles - one for each output filename:
    out_fastqs = [
        iCount.files.fastq.FastqFile(fname, 'wt') for fname in out_fnames
    ]

    # Determine experiment ID and random barcode for each fastq entry:
    kwargs = {'mismatches': mismatches, 'minimum_length': minimum_length}
    for fq_entry, exp_id, randomer in _extract(reads, barcodes, **kwargs):
        if randomer:
            metrics.reads_ok += 1
            if fq_entry.id[-2] == '/':
                # For early versions of Illumina, keep mate info at the end:
                r_pair = fq_entry.id[-2:]
                fq_entry.id = '{}:rbc:{}{}'.format(fq_entry.id[:-2], randomer,
                                                   r_pair)
            else:
                fq_entry.id = '{}:rbc:{}'.format(fq_entry.id, randomer)
        else:
            metrics.reads_fail += 1
        out_fastqs[exp_id].write(fq_entry)

    for out_fastq in out_fastqs:
        out_fastq.close()

    # Finally, remove adapters (if requested) or just rename to final names:
    out_fnames_final = [
        '{}.fastq.gz'.format(fname[:-13]) for fname in out_fnames
    ]
    for fname_in, fname_out in zip(out_fnames, out_fnames_final):
        if adapter and 'nomatch' not in fname_in:
            remove_adapter(fname_in,
                           fname_out,
                           adapter,
                           minimum_length=minimum_length)
            os.remove(fname_in)
        else:
            shutil.move(fname_in, fname_out)

    return metrics
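
The header rewrite in the loop above inserts ':rbc:<randomer>' into the read ID while keeping a trailing '/1' or '/2' mate suffix (used by early Illumina pipelines) at the very end. A stand-alone sketch of that logic with hypothetical read IDs:

def tag_with_randomer(read_id, randomer):
    """Append ':rbc:<randomer>' to a FASTQ read ID, keeping any trailing mate suffix."""
    if read_id[-2] == '/':
        # e.g. 'READ1/1' -> 'READ1:rbc:ACGT/1'
        return '{}:rbc:{}{}'.format(read_id[:-2], randomer, read_id[-2:])
    return '{}:rbc:{}'.format(read_id, randomer)

print(tag_with_randomer('READ1/1', 'ACGT'))  # READ1:rbc:ACGT/1
print(tag_with_randomer('READ1', 'ACGT'))    # READ1:rbc:ACGT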
Code Example #13
def genome(species, release, out_dir=None, genome=None):
    """
    Download GENCODE genome for given release/species.

    Note: This will download the "primary assembly" type of genome.

    Parameters
    ----------
    species : str
        Species latin name.
    release : str
        Release number. Only gencode releases {0}-{1} are available.
    out_dir : str
        Download to this directory (if not given, current working directory).
    genome : str
        Genome filename (must have .gz file extension). If not given original
        filename will be used. If genome is provided as absolute path, value
        of out_dir parameter is ignored and file is saved to given absolute
        path.

    Returns
    -------
    str
        Downloaded genome/sequence filename.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if species not in iCount.genomes.gencode.species():
        raise ValueError('Invalid species name.')

    if str(release) not in iCount.genomes.gencode.releases(species):
        raise ValueError('Invalid release number.')

    if genome:
        assert genome.endswith(('.fa', '.fasta', '.fa.gz', '.fasta.gz'))

    # If absolute path is given, ignore out_dir:
    if genome is not None and os.path.isabs(genome):
        if out_dir:
            LOGGER.info('out_dir parameter has been changed, since absolute '
                        'path is provided by genome parameter.')
        out_dir = os.path.dirname(genome)
        genome = os.path.basename(genome)
    if not out_dir:
        out_dir = os.getcwd()
    if not os.path.isdir(out_dir):
        raise ValueError('Directory "{}" does not exist'.format(out_dir))

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    ftp.cwd('/pub/gencode/Gencode_{}/release_{}/'.format(species, release))
    regex = r'[\w\d\.]+\.primary_assembly\.genome\.fa\.gz'
    fasta_file = next(
        (fname for fname in ftp.nlst() if re.match(regex, fname)), None)

    target_path = os.path.abspath(os.path.join(out_dir, genome or fasta_file))
    LOGGER.info('Downloading FASTA file into: %s', target_path)
    with open(target_path, 'wb') as fhandle:
        ftp.retrbinary('RETR ' + fasta_file, fhandle.write)
    ftp.quit()

    # Compute chromosome lengths:
    iCount.genomes.ensembl.chrom_length(target_path)

    LOGGER.info('Done.')
    return target_path
Code Example #14
def get_segments(annotation, segmentation, fai, report_progress=False):
    """
    Create GTF file with transcript level segmentation.

    Each line in this file should define one of the following elements:

        * gene
        * transcript
        * CDS
        * UTR3
        * UTR5
        * intron
        * ncRNA
        * intergenic

    The name of the third field (interval.fields[2]) should correspond to one
    of these names. Only GTF entries of chromosomes given in the fai file are
    considered.

    Parameters
    ----------
    annotation : str
        Path to input GTF file.
    segmentation : str
        Path to output GTF file.
    fai : str
        Path to input genome_file (.fai or similar).
    report_progress : bool
        Show progress.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.genes = 0

    # Container for storing intermediate data
    data = []

    LOGGER.debug('Opening genome file: %s', fai)
    fai = _first_two_columns(fai)
    with open(fai) as gfile:
        chromosomes = [line.strip().split()[0] for line in gfile]

    def process_gene(gene_content):
        """
        Process each group of intervals belonging to gene.

        Process each transcript_group in gene_content, add 'biotype'
        attribute to all intervals and include them in `data`.
        """
        assert 'gene' in gene_content

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            gene_content[id_] = _process_transcript_group(transcript_group)

        # Add biotype attribute to all intervals:
        gene_content = _add_biotype_attribute(gene_content)

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            data.extend(transcript_group)
        data.append(gene_content['gene'])

    LOGGER.debug('Processing genome annotation from: %s', annotation)
    for gene_content in _get_gene_content(annotation, chromosomes,
                                          report_progress):
        process_gene(gene_content)
        LOGGER.debug('Just processed gene: %s',
                     gene_content['gene'].attrs['gene_id'])
        metrics.genes += 1

    # Produce GTF/GFF file from data:
    gtf = BedTool(i.fields for i in data).saveas()

    LOGGER.info('Calculating intergenic intervals...')
    intergenic_pos = _complement(gtf.fn, fai, '+')
    intergenic_neg = _complement(gtf.fn, fai, '-')

    # Join the gtf, intergenic_pos and intergenic_neg in one file:
    file2 = tempfile.NamedTemporaryFile(delete=False)
    for infile in [gtf.fn, intergenic_pos, intergenic_neg]:
        shutil.copyfileobj(open(infile, 'rb'), file2)
    file2.close()

    file3 = BedTool(file2.name).sort().saveas(segmentation)
    LOGGER.info('Segmentation stored in %s', file3.fn)

    LOGGER.info('Making also gene level segmentation...')
    make_regions(segmentation,
                 out_dir=os.path.dirname(os.path.abspath(segmentation)))
    return metrics
Code Example #15
def make_summary_report(annotation,
                        sites,
                        summary,
                        fai,
                        types_length_file=None,
                        digits='8',
                        subtype=None,
                        excluded_types=None):
    """
    Make summary report from cross-link and annotation data.

    In the context of this report, "type" is the combination of the 3rd column
    and the `subtype` attribute from the annotation file (GTF).
    "Regions" are parts of a transcript (UTR, CDS, introns...) that span the
    whole transcript without intersecting one another. Intergenic regions are
    also considered regions. Each region has one and only one type.

    For each such type, the number of cross-link events/sites is counted.
    Sites = the count of *all* cross-link sites that fall in regions of some type.
    Events = the sum of col5 over *all* cross-link sites that fall in regions of
    some type (multiple events can happen at each cross-link). Col5 is the
    numerical value in the 5th column of the cross-link file.

    Parameters
    ----------
    annotation : str
        Path to annotation GTF file (should include subtype attribute).
    sites : str
        Path to BED6 file listing cross-linked sites.
    summary : str
        Path to output tab-delimited file with summary statistics.
    fai : str
        Path to file with chromosome lengths.
    types_length_file : str
        Path to file with lengths of each type.
    digits : int
        Number of decimal places in results.
    subtype : str
        Name of attribute to be used as subtype.
    excluded_types : list_str
        Types listed in the 3rd column of the GTF to be excluded from analysis.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    # If not given/present, make file with cumulative length for each type:
    if not types_length_file or not os.path.isfile(types_length_file):
        LOGGER.info('types_length_file not given - calculating it')
        types_length_file, annotation = make_types_length_file(
            annotation, fai, subtype=subtype, excluded_types=excluded_types)
    # read the file to dict, named type_lengths
    type_lengths = {}
    with open(types_length_file) as tfile:
        for line in tfile:
            parts = line.strip().split()
            type_lengths[' '.join(parts[:-1])] = int(parts[-1])

    # sorted=True - invokes memory efficient algorithm for large files
    # s=True - only report hits in B that overlap A on the same strand
    # wb=True - Write the original entry in B for each overlap
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).sort().saveas()
    LOGGER.info(
        'Calculating intersection between cross-link and annotation...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True,
                                     wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError):
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and '
                         'cross-links file (example: "chr1" vs. "1")')

    # dict structure = type_: [# of sites, # of events]
    type_counter = {type_: [0, 0] for type_ in type_lengths}
    site_types = []
    previous_segment = overlaps[0]

    def finalize(types, segment):
        """Increase counter for all types that intersect with segment."""
        for type_ in set(types):
            type_counter[type_][0] += 1  # sites
            type_counter[type_][1] += int(segment[4])  # events

    LOGGER.info('Extracting summary from data...')
    for segment in overlaps:
        # detect if segment contains new cross-link site:
        if segment.start != previous_segment.start or segment.strand != previous_segment.strand:
            finalize(site_types, previous_segment)
            site_types = []
        if subtype:
            # Extract subtype attribute:
            stype = re.match(r'.*{} "(.*)";'.format(subtype), segment[-1])
            stype = stype.group(1) if stype else None
            site_types.append(
                ' '.join([segment[8], stype] if stype else [segment[8]]))
        else:
            site_types.append(segment[8])
        previous_segment = segment
    finalize(site_types, previous_segment)

    # Produce report file:
    header = [
        'type', 'length', 'length %', 'sites #', 'sites %', 'sites enrichment',
        'events #', 'events %', 'events enrichment'
    ]
    sum_sites = sum([i[0] for i in type_counter.values()])
    sum_events = sum([i[1] for i in type_counter.values()])

    # total genome len = sum_of_all_chrom_length * 2 (there are + and - strand):
    total_length = sum([int(line.strip().split()[1])
                        for line in open(fai)]) * 2
    with open(summary, 'wt') as out:
        out.write('\t'.join(header) + '\n')
        for type_, [sites, events] in sorted(type_counter.items()):
            length_percent = type_lengths[type_] / total_length
            site_percent = sites / sum_sites
            site_enrichment = site_percent / length_percent
            event_percent = events / sum_events
            event_enrichment = event_percent / length_percent
            line = [
                type_, type_lengths[type_], length_percent, sites,
                site_percent, site_enrichment, events, event_percent,
                event_enrichment
            ]
            line = line[:1] + [round(i, int(digits)) for i in line[1:]]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(summary))
    return metrics
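
The enrichment columns in the report follow directly from the ratios computed in the writing loop above: a type's share of sites (or events) divided by its share of the total annotated length. A worked example with hypothetical counts:

# Hypothetical counts for a single type, showing the arithmetic used in the report.
type_length = 2_000_000          # bp covered by this type
total_length = 6_200_000_000     # bp; all chromosomes, counted on both strands
sites, sum_sites = 1_500, 100_000

length_percent = type_length / total_length      # ~0.00032
site_percent = sites / sum_sites                 # 0.015
site_enrichment = site_percent / length_percent  # ~46.5: sites are enriched in this type
print(round(site_enrichment, 2))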
Code Example #16
File: sigxls.py  Project: ulelab/iCount-Mini
def run(annotation,
        sites,
        sigxls,
        scores=None,
        features=None,
        group_by='gene_id',
        merge_features=False,
        half_window=3,
        fdr=0.05,
        perms=100,
        rnd_seed=42,
        report_progress=False):
    """
    Find positions with high density of cross-linked sites.

    When determining feature.name, value of the first existing attribute in the
    following tuple is taken::

        ("ID", "gene_name", "transcript_id", "gene_id", "Parent")

    Source in pybedtools:
    https://github.com/daler/pybedtools/blob/master/pybedtools/scripts/annotate.py#L34

    Parameters
    ----------
    annotation : str
        Annotation file in GTF format, obtained from "iCount segment" command.
    sites : str
        File with cross-links in BED6 format.
    sigxls : str
        File name for "sigxls" output. File reports positions with significant
        number of cross-link events. It should have .bed or .bed.gz extension.
    scores : str
        File name for "scores" output. File reports all cross-link events,
        independent of their FDR score. It should have .tsv, .csv, .txt or .gz
        extension.
    features : list_str
        Features from annotation to consider. If None, ['gene'] is used.
        Sometimes, it is advised to use ['gene', 'intergenic'].
    group_by : str
        Attribute by which cross-link positions are grouped.
    merge_features : bool
        Treat all features as one when grouping. Has no effect when only one
        feature is given in features parameter.
    half_window : int
        Half-window size.
    fdr : float
        FDR threshold.
    perms : int
        Number of permutations when calculating random distribution.
    rnd_seed : int
        Seed for random generator.
    report_progress : bool
        Report analysis progress.

    Returns
    -------
    iCount.Metrics
        Analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if features is None:
        features = ['gene']
    assert sigxls.endswith(('.bed', '.bed.gz'))
    if scores:
        assert scores.endswith(
            ('.tsv', '.tsv.gz', '.csv', '.csv.gz', '.txt', '.txt.gz'))
    numpy.random.seed(rnd_seed)  # pylint: disable=no-member

    LOGGER.info('Loading annotation file...')
    annotation2 = iCount.files.decompress_to_tempfile(annotation)
    if annotation2 != annotation:
        to_delete_temp = annotation2
        annotation = annotation2
    else:
        to_delete_temp = None
    annotation = pybedtools.BedTool(annotation).saveas()
    metrics.annotation_all = len(annotation)
    annotation = annotation.filter(lambda x: x[2] in features).sort().saveas()
    metrics.annotation_used = len(annotation)
    metrics.annotation_skipped = metrics.annotation_all - metrics.annotation_used
    LOGGER.info('%d out of %d annotation records will be used (%d skipped).',
                metrics.annotation_used, metrics.annotation_all,
                metrics.annotation_skipped)

    LOGGER.info('Loading cross-links file...')
    sites = pybedtools.BedTool(sites).sort().saveas()

    # intersect cross-linked sites with regions
    LOGGER.info(
        'Calculating intersection between annotation and cross-link file...')
    overlaps = annotation.intersect(sites, sorted=True, s=True,
                                    wo=True).saveas()

    groups = {}
    group_sizes = {}
    multi_mode = len(features) > 1 and not merge_features
    LOGGER.info('Processing intersections...')
    for feature in overlaps:
        chrom = feature.chrom
        start = feature.start
        end = feature.stop
        name = feature.name
        strand = feature.strand
        site_chrom = feature.fields[9]
        site_pos = int(feature.fields[10])
        site_end = int(feature.fields[11])
        site_dot = feature.fields[12]
        site_score = float(feature.fields[13])
        site_strand = feature.fields[14]
        assert site_chrom == chrom
        assert site_strand == strand
        assert site_dot == '.'
        assert site_pos == site_end - 1

        # Determine group_id depending on multi_mode...
        group_id = feature.attrs[group_by]
        if multi_mode:
            group_id = feature[2] + '_' + group_id

        groups.setdefault((chrom, strand, group_id, name), []).append(
            (site_pos, site_score))
        group_sizes.setdefault((chrom, strand, group_id, name), set()).add(
            (start, end))

    # Validate that segments in same group do not overlap: start of next feature
    # is greater than stop of the current one:
    for sizes in group_sizes.values():
        sizes = sorted(sizes)
        for first, second in zip(sizes, sizes[1:]):
            assert first[1] < second[0]

    # calculate total length of each group by summing element sizes:
    group_sizes = dict([(name, sum([end - start for start, end in elements]))
                        for name, elements in group_sizes.items()])

    # calculate and assign FDRs to each cross-linked site. FDR values are
    # calculated together for each group.
    results = {}
    metrics.all_groups = len(groups)
    progress, j = 0, 0
    for (chrom, strand, group_id, name), hits in sorted(groups.items()):
        j += 1
        if report_progress:
            new_progress = j / metrics.all_groups
            # pylint: disable=protected-access
            progress = iCount._log_progress(new_progress, progress, LOGGER)

        group_size = group_sizes[(chrom, strand, group_id, name)]

        # Crucial step: each position in a group is given an fdr_score, based on
        # hits in the group, group_size, half-window size and number of
        # permutations. Then, FDR scores (+ some other info) are written to
        # the `results` container:
        processed = _process_group(hits, group_size, half_window, perms)
        for (pos, val, val_extended, fdr_score) in processed:
            results.setdefault((chrom, pos, strand), []).\
                append((fdr_score, name, group_id, val, val_extended))
    metrics.positions_annotated = len(results)

    # cross-linked sites outside annotated regions
    LOGGER.info('Determining cross-links not intersecting with annotation...')
    skipped = sites.intersect(annotation, sorted=True, s=True, v=True).saveas()
    for feature in skipped:
        site_chrom = feature.chrom
        site_start = feature.start
        site_end = feature.stop
        # site_name = feature.name
        site_score = feature.score
        site_strand = feature.strand
        assert site_start == site_end - 1
        k = (site_chrom, site_start, site_strand)
        assert k not in results
        results.setdefault(k, []).\
            append((1.0, 'not_annotated', 'not_annotated', site_score, 'not_calculated'))

    metrics.positions_all = len(results)
    metrics.positions_not_annotated = metrics.positions_all - metrics.positions_annotated
    LOGGER.info(
        'Significant crosslinks calculation finished. Writing results to files...'
    )

    # Make sigxls: a BED6 file, with only the most significant cross-links:
    metrics.significant_positions = 0
    with iCount.files.gz_open(sigxls, 'wt') as sigxls:
        for (chrom, pos, strand), annot_list in sorted(results.items()):
            annot_list = sorted(annot_list)

            # report minimum fdr_score for each position in BED6
            min_fdr_score = annot_list[0][0]
            if min_fdr_score < fdr:
                metrics.significant_positions += 1
                # position has significant records - report the most significant ones:
                min_fdr_records = [
                    rec for rec in annot_list if rec[0] == min_fdr_score
                ]

                _, names, group_ids, group_scores, _ = zip(*min_fdr_records)
                if names == group_ids:
                    name = ','.join(names)
                else:
                    name = ','.join(names) + '-' + ','.join(group_ids)
                line = [chrom, pos, pos + 1, name, group_scores[0], strand]
                sigxls.write('\t'.join([_f2s(i, dec=4) for i in line]) + '\n')
    LOGGER.info('BED6 file with significant crosslinks saved to: %s',
                sigxls.name)

    # Make scores: a tab-separated file, with ALL cross-links, (no significance threshold)
    header = [
        'chrom', 'position', 'strand', 'name', 'group_id', 'score',
        'score_extended', 'FDR'
    ]
    if scores:
        with iCount.files.gz_open(scores, 'wt') as scores:
            scores.write('\t'.join(header) + '\n')
            for (chrom, pos, strand), annot_list in sorted(results.items()):
                for (fdr_score, name, group_id, score,
                     val_extended) in sorted(annot_list):
                    line = [
                        chrom, pos, strand, name, group_id, score,
                        val_extended, fdr_score
                    ]
                    scores.write('\t'.join([_f2s(i, dec=6)
                                            for i in line]) + '\n')
        LOGGER.info('Scores for each cross-linked position saved to: %s',
                    scores.name)

    if to_delete_temp:
        os.remove(to_delete_temp)

    LOGGER.info('Done.')
    return metrics
Code Example #17
File: ensembl.py  Project: ulelab/iCount-Mini
def genome(species,
           release=MAX_RELEASE_SUPPORTED,
           out_dir=None,
           genome=None,
           chromosomes=None):
    """
    Download ENSEMBL genome for given release/species.

    Parameters
    ----------
    species : str
        Species latin name.
    release : int
        Release number. Only ENSEMBL releases {0}-{1} are available.
    out_dir : str
        Download to this directory (if not given, current working directory).
    genome : str
        Genome filename (must have .gz file extension). If not given,
        species.release.fa.gz is used. If genome is provided as absolute path,
        value of out_dir parameter is ignored and file is saved to given
        absolute path.
    chromosomes : list_str
        If given, do not download the whole genome, but listed
        chromosomes only. Chromosomes can be given as strings or integers.

    Returns
    -------
    str
        Downloaded genome/sequence filename.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if species not in iCount.genomes.ensembl.species():
        raise ValueError('Invalid species name.')

    if not MIN_RELEASE_SUPPORTED <= release <= MAX_RELEASE_SUPPORTED:
        raise ValueError(
            'Release should be a number between {} and {}.'.format(
                MIN_RELEASE_SUPPORTED, MAX_RELEASE_SUPPORTED))

    if genome:
        assert genome.endswith(('.fa', '.fasta', '.fa.gz', '.fasta.gz'))

    # If absolute path is given, ignore out_dir:
    if genome and os.path.isabs(genome):
        if out_dir:
            LOGGER.info('out_dir parameter has been changed, since absolute '
                        'path is provided by genome parameter.')
        out_dir = os.path.dirname(genome)
        genome = os.path.basename(genome)
    if not out_dir:
        out_dir = os.getcwd()
    if not os.path.isdir(out_dir):
        raise ValueError('Directory "{}" does not exist.'.format(out_dir))

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    server_dir = '/pub/release-{}/fasta/{}/dna'.format(release, species)
    ftp.cwd(server_dir)
    all_fasta_files = ftp.nlst()

    filtered_files = []

    # Recognize all "whole-chromosome" files
    regex = r'{}\.[\w\d\-]+\.(\d+\.)*dna\.chromosome\.' \
            r'[a-zA-Z0-9]+\.fa\.gz'.format(species.capitalize())
    for fasta_file in all_fasta_files:
        if re.match(regex, fasta_file):
            filtered_files.append(fasta_file)

    def sorting_func(name):
        """Sort names by chromosome order."""
        key = name.split('.')[-3]
        if key == 'MT':
            return 'ZZ'  # this makes the mitochondrial "chromosome" the last one
        return key

    # Sort the filtered_files in correct order of chromosomes:
    filtered_files.sort(key=sorting_func)

    # If parameter chromosomes is given, take only a subset of all chromosomes
    if chromosomes:
        subset_list = []
        for chromosome in chromosomes:
            regex = r'.*chromosome\.{}\.fa\.gz'.format(str(chromosome))
            chromosome_found = False
            for file_ in filtered_files:
                if re.match(regex, file_):
                    subset_list.append(file_)
                    chromosome_found = True
            if not chromosome_found:
                # Provide user with a list of available chromosomes:
                chrom_regex = r'.*chromosome\.([a-zA-Z0-9]+)\.fa\.gz'
                available = [
                    re.match(chrom_regex, file_).group(1)
                    for file_ in filtered_files
                ]
                raise ValueError(
                    'Could not find chromosome {}. Available chromosomes '
                    'are: {}'.format(chromosome, ' '.join(available)))
        filtered_files = subset_list
        if not genome:
            genome = '{}.{}.chr{}.fa.gz'.format(
                species, release, '_'.join(map(str, chromosomes)))

    if not genome:
        genome = '{}.{}.fa.gz'.format(species, release)
    target_path = os.path.abspath(os.path.join(out_dir, genome))

    LOGGER.info('Downloading FASTA file into: %s', target_path)
    temp_dir = os.path.join(iCount.TMP_ROOT, 'ensembl')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    for fname in filtered_files:
        LOGGER.debug('Downloading file: %s', fname)
        # Download all files to tempdir:
        temp_file = os.path.join(temp_dir, fname)
        with open(temp_file, 'wb') as fhandle:
            ftp.retrbinary('RETR ' + fname, fhandle.write)

        # Write the content into final file (target_fname)
        with gzip.open(temp_file, 'rb') as f_in, gzip.open(target_path,
                                                           'ab') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Clean up: delete temporary files:
    ftp.quit()
    shutil.rmtree(temp_dir)

    # Compute chromosome lengths:
    chrom_length(target_path)

    LOGGER.info('Done.')
    return target_path
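
The sorting_func helper above orders per-chromosome FASTA files by the chromosome token embedded in the filename and pushes the mitochondrial file to the end. A quick stand-alone demonstration with hypothetical ENSEMBL-style filenames (note that the key is a string, so the ordering is lexicographic):

def sorting_func(name):
    """Sort ENSEMBL per-chromosome FASTA file names by chromosome, MT last."""
    key = name.split('.')[-3]
    return 'ZZ' if key == 'MT' else key

files = [
    'Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz',
    'Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz',
    'Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz',
]
print(sorted(files, key=sorting_func))  # chromosome 1, then 2, then MT last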
Code Example #18
File: demultiplex.py  Project: yx-xu/iCount
def run(reads,
        adapter,
        barcodes5,
        barcodes3=None,
        mismatches=1,
        minimum_length=15,
        min_adapter_overlap=7,
        prefix='demux',
        out_dir='.'):
    """
    Demultiplex FASTQ file.

    Split input FASTQ file into separate files, one for each barcode, and
    additional file for non-matching barcodes. Write the random barcode of a read
    into its FASTQ header row.

    Parameters
    ----------
    reads : str
        Sequencing reads.
    adapter : str
        Adapter sequence to remove from 3-prime end of reads.
    barcodes5 : list_str
        List of 5-prime end barcodes.
    barcodes3 : list_str
        List of 3-prime end barcodes.
    mismatches : int
        Number of tolerated mismatches when comparing barcodes.
    minimum_length : int
        Minimum length of trimmed sequence to keep.
    min_adapter_overlap : int
        Minimum length of adapter on 3' end if demultiplexing also on
        3' barcodes.
    prefix : str
        Prefix of generated FASTQ files.
    out_dir : str
        Output folder. Use current folder if none is given.

    Returns
    -------
    iCount.Metrics
        Metrics object, storing analysis metadata.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()
    metrics.reads_ok = 0
    metrics.reads_fail = 0

    kwargs = {
        'mismatches': mismatches,
        'minimum_length': minimum_length,
        'prefix': prefix,
        'out_dir': out_dir,
        'metrics': metrics,
    }

    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    # This should be a dict, where each 5' barcode has a list of its 3' barcodes
    barcodes = prepare_barcodes(barcodes5, barcodes3)

    LOGGER.info("Demultiplexing based on 5' barcodes...")
    # Demultiplex by 5' barcodes only. Store in files. Just like before.
    demultiplex(reads=reads, barcodes=barcodes, **kwargs)

    os.rename(
        os.path.join(out_dir, 'demux_nomatch.fastq.gz'),
        os.path.join(out_dir, 'demux_nomatch5.fastq.gz'),
    )

    LOGGER.info("Demultiplexing based on 3' barcodes...")
    for barcode5 in barcodes:
        reads5 = os.path.join(out_dir, 'demux_{}.fastq.gz'.format(barcode5))
        barcodes3 = barcodes[barcode5]['barcodes3']

        if not barcodes3:
            # This barcode has no 3' counterparts. Just remove the adapter and continue
            # TODO: polish the parameters for adapter removal in this case...
            remove_adapter(reads5, adapter, overwrite=True)
            continue

        # One must be sure that there actually are 3' barcodes on the
        # 3' end. In roughly 20-30% of cases the read is so short that it does
        # not reach the 3' barcode. In such cases, demultiplexing by the 3'
        # end would amount to random chance, which is not acceptable. To be
        # sure that the 3' barcode is reached, the read needs to contain at
        # least ``min_adapter_overlap`` bp of the adapter.

        no_adapters = os.path.join(
            out_dir, "no_adapter_found_{}.fastq.gz".format(barcode5))
        remove_adapter(reads5,
                       adapter,
                       overwrite=True,
                       overlap=min_adapter_overlap,
                       untrimmed_output=no_adapters)

        # Fix the prefix, to include 5' barcode info:
        kwargs['prefix'] = '{}_{}'.format(prefix, barcode5)
        kwargs['mismatches'] = 0

        # Now, demultiplex based on 3' barcodes
        demultiplex(reads=reads5, barcodes=barcodes3, **kwargs)

        # File that is demultiplexed only by 5'end is not needed anymore
        os.remove(reads5)

    # TODO: merge nomatch files 3' and 5' end... This may be many!

    return metrics
Code Example #19
File: segment.py  Project: nebo56ucl/iCount
def get_regions(annotation, segmentation, fai, report_progress=False):
    """
    Create new gtf file with custom annotation and filtered content.

    Each line in new file should define one of the following elements:

        * gene
        * transcript
        * CDS
        * intron
        * UTR3
        * UTR5
        * stop_codon
        * ncRNA
        * intergenic

    The name of the third field (interval.fields[2]) should correspond to one
    of these names. Only GTF entries of chromosomes given in the genome_file
    are considered.

    Parameters
    ----------
    annotation : str
        Path to input GTF file.
    segmentation : str
        Path to output GTF file.
    fai : str
        Path to input genome_file (.fai or similar).
    report_progress : bool
        Switch to show progress.

    Returns
    -------
    str
        Absolute path to output GTF file.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    # Container for storing intermediate data
    data = []

    LOGGER.debug('Opening genome file: %s', fai)

    # Keep just first two column or _complement will fail...
    fai = _first_two_columns(fai)
    with open(fai) as gfile:
        chromosomes = [line.strip().split()[0] for line in gfile]

    def process_gene(gene_content):
        """
        Process each group of intervals belonging to gene.

        Process each transcript_group in gene_content, add 'biotype'
        attribute to all intervals and include them in `data`.
        """
        assert 'gene' in gene_content

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            gene_content[id_] = _process_transcript_group(transcript_group)

        # Add biotype attribute to all intervals:
        gene_content = _add_biotype_attribute(gene_content)

        for id_, transcript_group in gene_content.items():
            if id_ == 'gene':
                continue
            data.extend(transcript_group)
        data.append(gene_content['gene'])

    LOGGER.debug('Processing genome annotation from: %s', annotation)
    for gene_content in _get_gene_content(annotation, chromosomes,
                                          report_progress):
        process_gene(gene_content)
        LOGGER.debug('Just processed gene: %s',
                     gene_content['gene'].attrs['gene_id'])
    # This can be replaced with: multiprocessing.Pool, but it causes huge
    # memory usage. Possible explanation and solution:
    # http://stackoverflow.com/questions/21485319/high-memory-usage-using-python-multiprocessing
    # TODO: check and fix execution in parallel, look example
    # https://daler.github.io/pybedtools/3-brief-examples.html#example-3-count-reads-in-introns-and-exons-in-parallel
    # p = multiprocessing.Pool(threads, maxtasksperchild=100)
    # p.map(process_gene, _get_gene_content(gtf_in, chromosomes))

    # Produce GTF/GFF file from data:
    gtf = pybedtools.BedTool(i.fields for i in data).saveas()

    LOGGER.info('Calculating intergenic regions...')
    intergenic_pos = _complement(gtf.fn, fai, '+')
    intergenic_neg = _complement(gtf.fn, fai, '-')

    # Join the gtf, intergenic_pos and intergenic_neg in one file:
    file2 = tempfile.NamedTemporaryFile(delete=False)
    for infile in [gtf.fn, intergenic_pos, intergenic_neg]:
        shutil.copyfileobj(open(infile, 'rb'), file2)
    file2.close()

    file3 = pybedtools.BedTool(file2.name).sort().saveas(segmentation)
    LOGGER.info('Segmentation stored in %s', file3.fn)
    return file3.fn
Code Example #20
File: star.py  Project: ulelab/iCount-Mini
def build_index(genome,
                genome_index,
                annotation='',
                overhang=100,
                overhang_min=8,
                threads=1,
                genome_sasparsed=1,
                genome_saindexnbases=14):
    """
    Call STAR to generate genome index, which is used for mapping.

    Parameters
    ----------
    genome : str
        Genome sequence to index.
    genome_index : str
        Output folder, where to store genome index.
    annotation : str
        Annotation that defines splice junctions.
    overhang : int
        Sequence length around annotated junctions to be used by STAR when
        constructing splice junction database.
    overhang_min : int
        Minimum overhang for unannotated junctions.
    threads : int
        Number of threads that STAR can use for generating index.
    genome_sasparsed : int
        STAR parameter genomeSAsparseD.
        Suffix array sparsity. Bigger numbers decrease RAM requirements
        at the cost of mapping speed reduction. Suggested values
        are 1 (30 GB RAM) or 2 (16 GB RAM).
    genome_saindexnbases : int
        STAR parameter genomeSAindexNbases.
        SA pre-indexing string length, typically between 10 and 15.
        Longer strings require more memory, but result in faster searches.

    Returns
    -------
    int
        Star return code.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not os.path.isdir(genome_index):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    LOGGER.info('Building genome index with STAR for genome %s', genome)
    genome_fname2 = iCount.files.decompress_to_tempfile(genome, 'starindex')
    args = [
        'STAR',
        '--runThreadN',
        '{:d}'.format(threads),
        '--genomeSAsparseD',
        '{:d}'.format(genome_sasparsed),
        '--genomeSAindexNbases',
        '{:d}'.format(genome_saindexnbases),
        '--runMode',
        'genomeGenerate',
        '--genomeDir',
        '{:s}'.format(genome_index),
        '--genomeFastaFiles',
        '{:s}'.format(genome_fname2),
        '--alignSJoverhangMin',
        '{:d}'.format(overhang_min),
        '--outFileNamePrefix',
        '{:s}'.format(genome_index),
    ]
    if annotation:
        annotation2 = iCount.files.decompress_to_tempfile(
            annotation, 'starindex')
        args.extend([
            '--sjdbGTFfile',
            annotation2,
            '--sjdbOverhang',
            '{:d}'.format(overhang),
        ])
    else:
        annotation2 = annotation

    try:
        ret_code = 1
        for name, value in _execute(args):
            if name == 'return_code' and isinstance(value, int):
                ret_code = value
                break
            elif name == 'stdout_line':
                LOGGER.info(value.strip())
            elif name == 'stderr_lines':
                for line in value.split('\n'):
                    LOGGER.error(line.strip())
    finally:
        # remove temporary decompressed files
        if genome != genome_fname2:
            os.remove(genome_fname2)
        if annotation != annotation2:
            os.remove(annotation2)

    LOGGER.info('Done.')
    return ret_code
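A minimal usage sketch for build_index() above; the file names are hypothetical, STAR must be installed and on the PATH, and the output directory has to exist before the call:

import os

os.makedirs('star_index', exist_ok=True)   # build_index() requires an existing directory
return_code = build_index(
    genome='genome.fa.gz',             # hypothetical genome FASTA (may be gzipped)
    genome_index='star_index',
    annotation='annotation.gtf.gz',    # optional; enables the splice-junction database
    overhang=100,
    threads=4,
)
assert return_code == 0, 'STAR genomeGenerate failed'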
Code example #21
File: ensembl.py  Project: ulelab/iCount-Mini
def annotation(species,
               release=MAX_RELEASE_SUPPORTED,
               out_dir=None,
               annotation=None):
    """
    Download ENSEMBL annotation for given release/species.

    Parameters
    ----------
    species : str
        Species latin name.
    release : int
        Release number. Only ENSEMBL releases {0}-{1} are available.
    out_dir : str
        Download to this directory (if not given, current working directory).
    annotation : str
        Annotation filename (must have .gz file extension). If not given,
        species.release.gtf.gz is used. If annotation is provided as absolute
        path, value of out_dir parameter is ignored and file is saved to given
        absolute path.

    Returns
    -------
    str
        Downloaded annotation filename.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not MIN_RELEASE_SUPPORTED <= release <= MAX_RELEASE_SUPPORTED:
        raise ValueError(
            'Release should be a number between {} and {}.'.format(
                MIN_RELEASE_SUPPORTED, MAX_RELEASE_SUPPORTED))

    if species not in iCount.genomes.ensembl.species(release):
        raise ValueError('Invalid species name.')

    if annotation:
        assert annotation.endswith(('.gtf', '.gtf.gz'))
    else:
        annotation = '{}.{}.gtf.gz'.format(species, release)

    # If absolute path is given, ignore out_dir:
    if os.path.isabs(annotation):
        if out_dir:
            LOGGER.info('out_dir parameter is overridden, since an absolute '
                        'path was provided via the annotation parameter.')
        out_dir = os.path.dirname(annotation)
        annotation = os.path.basename(annotation)
    if not out_dir:
        out_dir = os.getcwd()
    if not os.path.isdir(out_dir):
        raise ValueError('Directory "{}" does not exist.'.format(out_dir))

    ftp = iCount.genomes.get_ftp_instance(BASE_URL)
    server_dir = '/pub/release-{}/gtf/{}/'.format(release, species)
    ftp.cwd(server_dir)
    server_files = ftp.nlst()

    # Get the desired annotation file from the list of all server files:
    annotation_file = ''
    regex = r'{}\.[\w\d]+\.\d+\.gtf\.gz'.format(species.capitalize())
    for file_ in server_files:
        if re.match(regex, file_):
            annotation_file = file_
            break

    if not annotation_file:
        LOGGER.info('No GTF file found for species %s, release %s', species,
                    str(release))
        return None

    # Download to file on disk
    saveas_fname = os.path.join(out_dir, annotation)
    LOGGER.info('Downloading GTF to: %s', saveas_fname)
    with open(saveas_fname, 'wb') as fhandle:
        ftp.retrbinary('RETR ' + annotation_file, fhandle.write)
    ftp.quit()

    LOGGER.info('Done.')
    return os.path.abspath(saveas_fname)
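A minimal usage sketch for the download helper above; network access to the ENSEMBL FTP is assumed, and the species/release values are only illustrative:

gtf_path = annotation(
    'homo_sapiens',
    release=92,     # illustrative; must lie within the supported release range
    out_dir='.',    # download into the current working directory
)
print('Annotation saved to:', gtf_path)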
Code example #22
File: summary.py  Project: yx-xu/iCount
def summary_reports(annotation, sites, out_dir, templates_dir=None):
    """
    Make summary reports for a cross-link file.

    Parameters
    ----------
    annotation : str
        Annotation file (GTF format). It is recommended to use the genome-level segmentation (e.g. regions.gtf.gz)
        produced by the ``iCount segment`` command.
    sites : str
        Cross-links file (BED6 format). Should be sorted by coordinate.
    out_dir : str
        Output directory.
    templates_dir : str
        Directory containing templates for summary calculation, made by the ``iCount segment`` command. If this
        argument is not provided, summary templates are made on the fly.

    Returns
    -------
    iCount.Metrics
        iCount Metrics object.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    if templates_dir is None:
        templates_dir = tempfile.mkdtemp()
        summary_templates(annotation, templates_dir)

    LOGGER.info(
        'Calculating intersection between cross-link and annotation...')
    # pylint: disable=too-many-function-args,unexpected-keyword-arg
    overlaps = BedTool(sites).intersect(
        BedTool(annotation),
        sorted=True,  # invokes memory efficient algorithm for large files
        s=True,  # only report hits in B that overlap A on the same strand
        wb=True,  # write the original entry in B for each overlap
        nonamecheck=True,  # do not print warnings about name inconsistency to stdout
    ).saveas()
    # pylint: enable=too-many-function-args,unexpected-keyword-arg
    try:
        overlaps[0]  # will raise Error if overlaps is empty:
    except (IndexError, TypeError):
        raise ValueError(
            'No intersections found. This may be caused by different naming of chromosomes in the annotation '
            'and cross-links files (example: "chr1" vs. "1").')

    type_counter, subtype_counter, gene_counter = {}, {}, {}
    LOGGER.info('Extracting summary data from intersection...')
    for segment in overlaps:
        score = int(segment.score)

        type_ = segment[8]
        type_counter[type_] = type_counter.get(type_, 0) + score

        biotype = re.match(r'.*biotype "(.*?)";', segment[-1])
        biotype = biotype.group(1) if biotype else ''
        biotypes = biotype.split(',')
        for biotype in biotypes:
            sbtyp = iCount.genomes.region.make_subtype(type_, biotype)
            subtype_counter[sbtyp] = subtype_counter.get(
                sbtyp, 0) + score / len(biotypes)

        gene_id = re.match(r'.*gene_id "(.*?)";', segment[-1])
        gene_id = gene_id.group(1) if gene_id else None
        gene_counter[gene_id] = gene_counter.get(gene_id, 0) + score

    sum_cdna = 0
    for seg in BedTool(sites):
        sum_cdna += int(seg.score)

    def parse_template(template_file):
        """Parse template file."""
        template = {}
        with open(template_file, 'rt') as ifile:
            for line in ifile:
                line = line.strip().split('\t')
                template[line[0]] = line[1:]
        return template

    LOGGER.info('Writing type report...')
    type_template = parse_template(os.path.join(templates_dir, TEMPLATE_TYPE))
    with open(os.path.join(out_dir, SUMMARY_TYPE), 'wt') as out:
        header = ['Type', 'Length', 'cDNA #', 'cDNA %']
        out.write('\t'.join(header) + '\n')
        for type_, cdna in sorted(type_counter.items(),
                                  key=lambda x: sort_types_subtypes(x[0])):
            line = [
                type_,
                type_template.get(type_, [-1])[0],
                math.floor(cdna), cdna / sum_cdna * 100
            ]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Writing subtype report...')
    subtype_template = parse_template(
        os.path.join(templates_dir, TEMPLATE_SUBTYPE))
    with open(os.path.join(out_dir, SUMMARY_SUBTYPE), 'wt') as out:
        header = ['Subtype', 'Length', 'cDNA #', 'cDNA %']
        out.write('\t'.join(header) + '\n')
        for stype, cdna in sorted(subtype_counter.items(),
                                  key=lambda x: sort_types_subtypes(x[0])):
            line = [
                stype,
                subtype_template.get(stype, [-1])[0],
                math.floor(cdna), cdna / sum_cdna * 100
            ]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Writing gene report...')
    gene_template = parse_template(os.path.join(templates_dir, TEMPLATE_GENE))
    with open(os.path.join(out_dir, SUMMARY_GENE), 'wt') as out:
        header = ['Gene name (Gene ID)', 'Length', 'cDNA #', 'cDNA %']
        out.write('\t'.join(header) + '\n')
        for gene_id, cdna in sorted(gene_counter.items()):
            gene_name, length = gene_template.get(gene_id, ['', -1])
            if gene_id == '.':
                gene_name = 'intergenic'
            line = [
                '{} ({})'.format(gene_name, gene_id), length,
                math.floor(cdna), cdna / sum_cdna * 100
            ]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Done.')
    return metrics
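For illustration, the hypothetical helper below isolates the attribute-parsing step used above: a GTF attribute string is matched with re.match and the cDNA score of a segment is split evenly across its (comma-separated) biotypes:

import re

def split_score_by_biotype(attrs, score):
    """Split `score` evenly across the biotypes listed in a GTF attribute string."""
    match = re.match(r'.*biotype "(.*?)";', attrs)
    biotypes = (match.group(1) if match else '').split(',')
    return {biotype: score / len(biotypes) for biotype in biotypes}

print(split_score_by_biotype(
    'gene_id "ENSG00000157764"; biotype "protein_coding,retained_intron";', 10))
# -> {'protein_coding': 5.0, 'retained_intron': 5.0}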
Code example #23
File: demultiplex.py  Project: nebo56ucl/iCount
def run(reads,
        adapter,
        barcodes,
        mismatches=1,
        minimum_length=15,
        prefix='demux',
        out_dir='.'):
    """
    Demultiplex FASTQ file.

    Split input FASTQ file into separate files, one for each barcode, and
    additional file for non-matching barcodes.

    Parameters
    ----------
    reads : str
        Path to reads from a sequencing library.
    adapter : str
        Adapter sequence to remove from ends of reads.
    barcodes : list_str
        List of barcodes used for library.
    mismatches : int
        Number of tolerated mismatches when comparing barcodes.
    minimum_length : int
        Minimum length of trimmed sequence to keep.
    prefix : str
        Prefix of generated FASTQ files.
    out_dir : str
        Output folder. Use local folder if none given.

    Returns
    -------
    list
        List of filenames where separated reads are stored.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')
    out_fn_prefix = []
    for barcode in ['nomatch'] + barcodes:
        out_fn_prefix.append(
            os.path.join(out_dir, '{}_{}'.format(prefix, barcode)))

    if adapter:
        # need to remove adapter
        out_fns = ['{:s}_raw.fastq.gz'.format(fn) for fn in out_fn_prefix]
    else:
        out_fns = ['{:s}.fastq.gz'.format(fn) for fn in out_fn_prefix]

    # demultiplex
    LOGGER.info('Allowing max %d mismatches in barcodes.', mismatches)
    LOGGER.info('Demultiplexing file: %s', reads)
    demultiplex(reads, out_fns[1:], out_fns[0], barcodes, mismatches,
                minimum_length)
    LOGGER.info('Saving results to:')
    for fn in out_fns:
        LOGGER.info('    %s', fn)

    # remove adapter, if requested
    if adapter:
        LOGGER.info('Trimming adapters (discarding shorter than %d)...',
                    minimum_length)
        out_fns_intermediate = out_fns
        out_fns = ['{:s}.fastq.gz'.format(fn) for fn in out_fn_prefix]
        for fn_in, fn_out in zip(out_fns_intermediate, out_fns):
            remove_adapter(fn_in,
                           fn_out,
                           adapter,
                           minimum_length=minimum_length)
            os.remove(fn_in)

    return out_fns
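A minimal usage sketch for the wrapper above; the file name, adapter and barcodes are hypothetical, and the demultiplex() and remove_adapter() helpers are assumed to be importable from the same module:

import os

os.makedirs('demultiplexed', exist_ok=True)   # run() requires an existing directory
files = run(
    reads='library.fastq.gz',                 # hypothetical multiplexed FASTQ
    adapter='AGATCGGAAGAGC',                  # illustrative 3' adapter sequence
    barcodes=['NNNGGTTNN', 'NNNTTGTNN'],      # illustrative 5' barcodes
    mismatches=1,
    out_dir='demultiplexed',
)
# files[0] collects reads with no matching barcode; the rest follow barcode order.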
Code example #24
def annotate_cross_links(annotation,
                         sites,
                         sites_annotated,
                         subtype='biotype',
                         excluded_types=None):
    """
    Annotate each cross-link site with all region types that intersect it.

    Each cross-link can overlap/intersect with many intervals in the annotation
    file, and each of these intervals has a type. Make a copy of the cross-link
    file and include all these types in the 4th column.

    In the context of this report, "type" is the combination of the 3rd column
    and the `subtype` attribute from the annotation file (GTF). Regions/intervals
    are parts of a transcript (UTR, CDS, introns, ...) that span the whole
    transcript without intersecting each other. However, since transcripts can
    overlap, intervals belonging to different transcripts can also overlap.
    Intergenic regions are considered regions as well. Each region has one and
    only one type.

    Parameters
    ----------
    annotation : str
        Path to annotation file (should be GTF and include `subtype` attribute).
    sites : str
        Path to input BED6 file listing all cross-linked sites.
    sites_annotated : str
        Path to output BED6 file listing annotated cross-linked sites.
    subtype : str
        Subtype.
    excluded_types : list_str
        Excluded types.

    Returns
    -------
    iCount.Metrics
        iCount Metrics object.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    excluded_types = excluded_types or []
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).filter(
        lambda x: x[2] not in excluded_types).sort().saveas()

    LOGGER.info(
        'Calculating overlaps between cross-link and annotation_file...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True,
                                     wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError):
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and '
                         'cross_links file ("chr1" vs. "1").')

    data = []  # container for final annotated BED file intervals
    site_types = []  # container for all types intersecting with a given cross-link
    previous_interval = overlaps[0]

    def finalize(types, site):
        """Make annotated (with all intersecting types) cross link interval."""
        data.append(
            create_interval_from_list(
                site[0:3] + ['; '.join(map(str, sorted(set(types))))] +
                site[4:6]))

    for interval in overlaps:
        # Detect new cross link:
        if interval.start != previous_interval.start or \
           interval.strand != previous_interval.strand:
            finalize(site_types, previous_interval)
            site_types = []
        if subtype:
            # Extract subtype attribute:
            stype = re.match(r'.*{} "(.*?)";'.format(subtype), interval[-1])
            site_types.append('{} {}'.format(interval[8],
                                             stype.group(1) if stype else '.'))
        else:
            site_types.append(interval[8])
        previous_interval = interval
    finalize(site_types, previous_interval)

    # Produce annotated cross-link file:
    LOGGER.info('Writing results to file...')
    # To save with .gz compression, the data is first written to a temporary
    # file and only then saved under the filename with the .gz extension;
    # otherwise the file would carry a .gz extension without actually being gzipped.
    tmp_ann = pybedtools.BedTool(line for line in data).saveas()
    tmp_ann.saveas(sites_annotated)

    LOGGER.info('Done. Output saved to: %s', os.path.abspath(sites_annotated))
    return metrics
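For illustration, the snippet below (with made-up values) reproduces how finalize() above assembles the 4th BED column: intersecting "type subtype" strings are de-duplicated, sorted and joined with '; ':

site_types = ['CDS protein_coding', 'intron lincRNA', 'CDS protein_coding']
name_column = '; '.join(map(str, sorted(set(site_types))))
print(name_column)  # -> 'CDS protein_coding; intron lincRNA'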
Code example #25
File: star.py  Project: nebo56ucl/iCount
def map_reads(reads,
              genome_index,
              out_dir,
              annotation='',
              multimax=10,
              mismatches=2,
              threads=1):
    """
    Map FASTQ file reads to reference genome.

    TODO

    Parameters
    ----------
    reads : str
        Sequencing reads to map to genome.
    genome_index : str
        Folder with genome index.
    out_dir : str
        Output folder, where to store mapping results.
    annotation : str
        GTF annotation needed for mapping to splice-junctions.
    multimax : int
        Number of allowed multiple hits.
    mismatches : int
        Number of allowed mismatches.
    threads : int
        Number of threads that STAR can use for generating index.

    Returns
    -------
    int
        Return code.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not os.path.isdir(genome_index):
        raise FileNotFoundError(
            'Directory with genome index does not exist. Make sure it does.')
    if not os.path.isdir(out_dir):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    LOGGER.info('Mapping reads from %s', reads)
    sequences_fname2 = iCount.files.decompress_to_tempfile(reads, 'starmap')

    args = [
        'STAR',
        '--runMode',
        'alignReads',
        '--runThreadN',
        '{:d}'.format(threads),
        '--genomeDir',
        '{:s}'.format(genome_index),
        '--readFilesIn',
        '{:s}'.format(sequences_fname2),
    ]
    if not out_dir.endswith('/'):
        out_dir += '/'

    args.extend([
        '--outFileNamePrefix',
        '{:s}'.format(out_dir),
        '--outSAMprimaryFlag',
        'AllBestScore',
        '--outFilterMultimapNmax',
        '{:d}'.format(multimax),
        '--outFilterMismatchNmax',
        '{:d}'.format(mismatches),
        '--alignEndsType',
        'EndToEnd',
        # otherwise soft-clipping of the starts and ends may produce too
        # many multiple hits
        '--outSAMtype',
        'BAM',
        'SortedByCoordinate',
        '--outSAMunmapped',
        'Within',
        'KeepPairs',
    ])
    if annotation:
        annotation2 = iCount.files.decompress_to_tempfile(
            annotation, 'starmap')
        args.extend([
            '--sjdbGTFfile',
            annotation2,
        ])
    else:
        annotation2 = annotation

    try:
        ret_code = 1
        for name, value in _execute(args):
            if name == 'return_code' and isinstance(value, int):
                ret_code = value
                break
            elif name == 'stdout_line':
                LOGGER.info(value.strip())
            elif name == 'stderr_lines':
                for line in value.split('\n'):
                    LOGGER.error(line.strip())
    finally:
        # remove temporary decompressed files
        if reads != sequences_fname2:
            os.remove(sequences_fname2)
        if annotation != annotation2:
            os.remove(annotation2)

    LOGGER.info('Done.')
    return ret_code
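A minimal usage sketch for map_reads() above; the paths are hypothetical, STAR must be installed and on the PATH, and the index is expected to have been built beforehand (e.g. with build_index()):

import os

os.makedirs('mapping', exist_ok=True)       # map_reads() requires an existing directory
return_code = map_reads(
    reads='demux_NNNGGTTNN.fastq.gz',       # hypothetical demultiplexed FASTQ
    genome_index='star_index',              # directory with a pre-built STAR index
    out_dir='mapping',
    annotation='annotation.gtf.gz',         # optional splice-junction annotation
    multimax=10,
    mismatches=2,
    threads=4,
)
assert return_code == 0, 'STAR alignReads failed'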
Code example #26
File: star.py  Project: nebo56ucl/iCount
def build_index(genome,
                genome_index,
                annotation='',
                overhang=100,
                overhang_min=8,
                threads=1):
    """
    Call STAR to generate genome index, which is used for mapping.

    Parameters
    ----------
    genome : str
        Genome sequence to index.
    genome_index : str
        Output folder, where to store genome index.
    annotation : str
        Annotation that defines splice junctions.
    overhang : int
        Sequence length around annotated junctions to be used by STAR when
        constructing splice junction database.
    overhang_min : int
        Minimum overhang for unannotated junctions.
    threads : int
        Number of threads that STAR can use for generating index.

    Returns
    -------
    int
        Star return code.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not os.path.isdir(genome_index):
        raise FileNotFoundError(
            'Output directory does not exist. Make sure it does.')

    LOGGER.info('Building genome index with STAR for genome %s', genome)
    genome_fname2 = iCount.files.decompress_to_tempfile(genome, 'starindex')
    args = [
        'STAR',
        '--runThreadN',
        '{:d}'.format(threads),
        '--runMode',
        'genomeGenerate',
        '--genomeDir',
        '{:s}'.format(genome_index),
        '--genomeFastaFiles',
        '{:s}'.format(genome_fname2),
        '--alignSJoverhangMin',
        '{:d}'.format(overhang_min),
        '--outFileNamePrefix',
        '{:s}'.format(genome_index),
    ]
    if annotation:
        annotation2 = iCount.files.decompress_to_tempfile(
            annotation, 'starindex')
        args.extend([
            '--sjdbGTFfile',
            annotation2,
            '--sjdbOverhang',
            '{:d}'.format(overhang),
        ])
    else:
        annotation2 = annotation

    try:
        ret_code = 1
        for name, value in _execute(args):
            if name == 'return_code' and isinstance(value, int):
                ret_code = value
                break
            elif name == 'stdout_line':
                LOGGER.info(value.strip())
            elif name == 'stderr_lines':
                for line in value.split('\n'):
                    LOGGER.error(line.strip())
    finally:
        # remove temporary decompressed files
        if genome != genome_fname2:
            os.remove(genome_fname2)
        if annotation != annotation2:
            os.remove(annotation2)

    LOGGER.info('Done.')
    return ret_code