Example #1
File: run.py Project: dbrrt/pisces
import os
import logging
import pisces
import time
import socket
from multiprocessing import Process
from pisces import find_data_directory, long_substr
from itertools import chain
from pkg_resources import get_distribution

__version__ = get_distribution("novartis_pisces").version

data_dir = find_data_directory()

def run(args, unknown_args):
    """ Main user script to run the PISCES workflow """
    if not args.out:
        output_dir = os.path.join(
            os.path.dirname(os.path.abspath(args.fq1[0])), 'PISCES')
    else:
        output_dir = os.path.abspath(args.out)

    if os.path.exists(output_dir) and not args.overwrite:
        msg = "PISCES output directory %s already exists!" % output_dir
        raise IOError(msg)
    elif os.path.exists(output_dir) and args.overwrite:
        msg = "PISCES output directory %s already exists. Overwriting..." % output_dir
        #shutil.rmtree(output_dir)
        #os.makedirs(output_dir)
    else:
        msg = "Creating PISCES output directory %s." % output_dir
Example #2
def build_index(args, unknown_args):
    from pyfaidx import Fasta
    from twobitreader import TwoBitFile
    import gffutils
    import gffutils.merge_criteria as mc
    import atexit
    import shutil
    from tqdm import tqdm
    from collections import defaultdict
    from pprint import pprint
    from tempfile import NamedTemporaryFile
    from urllib.parse import urlparse
    from urllib.request import urlopen
    from shutil import copyfileobj
    from subprocess import Popen, PIPE, call

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M')

    config = args.conf
    logging.info("PISCES version %s", __version__)

    for species, datasets in list(config.items()):
        indices = datasets["index"]
        download_dir = datasets["downloads"]
        index_dir_base = datasets["index_dir"]
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
        for index_name, dataset in list(indices.items()):
            if args.indices and index_name not in args.indices:
                continue
            pprint(dataset, indent=4)
            options = defaultdict(lambda: True)
            options.update(dataset["options"])
            index_dir_path = os.path.join(index_dir_base, species, index_name)
            if os.path.exists(index_dir_path):
                if args.overwrite:
                    logging.warning(
                        "index directory %s already exists! overwriting",
                        index_dir_path)
                    shutil.rmtree(
                        os.path.join(index_dir_path, "transcripts"))
                    shutil.rmtree(os.path.join(index_dir_path, "salmon"))
                else:
                    continue
            os.makedirs(os.path.join(index_dir_path, "transcripts"))
            os.makedirs(os.path.join(index_dir_path, "salmon"))

            transcripts_fasta_file = os.path.join(
                index_dir_path, "transcripts", "transcripts.fa")
                
            with open(transcripts_fasta_file, 'w') as transcripts_fasta:
                ## all of this URI handling should probably use an existing
                ## library like https://github.com/intake/filesystem_spec
                ## (fsspec); see the sketch after this listing
                for fasta_loc in dataset["extra_fastas"]:
                    fasta = urlparse(fasta_loc)
                    if fasta.scheme == '':
                        # a bare path: read the local file in place
                        _fasta_local_path = fasta.path
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path,
                                          'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                                    if fasta.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     fasta.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _fasta_local_path,
                                                '>',
                                                _fasta_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                # use a distinct handle name so the parsed URL
                                # in `fasta` is not shadowed by the output file
                                with open(
                                        _fasta_local_path.replace(
                                            "2bit", "fa"), 'w') as fasta_out:
                                    for chrom in twobit.keys():
                                        fasta_out.write(">%s\n" % chrom)
                                        fasta_out.write(str(twobit[chrom]) + '\n')
                            # point the local path at the FASTA copy so it is
                            # appended below instead of the 2bit binary
                            _fasta_local_path = _fasta_local_path.replace(
                                "2bit", "fa")
                            reference = Fasta(_fasta_local_path)
                    
                    # append the decompressed, FASTA-format copy when the
                    # download was gzipped
                    if _fasta_local_path.endswith('.gz'):
                        _fasta_local_path = _fasta_local_path.replace(".gz", "")
                    with open(_fasta_local_path) as extra:
                        logging.info("Adding entries from %s", fasta_loc)
                        for line in extra:
                            transcripts_fasta.write(line)
                            
                for gtf_loc, fasta_loc in zip(dataset["gtfs"],
                                              dataset["fastas"]):
                    gtf = urlparse(gtf_loc)
                    fasta = urlparse(fasta_loc)
                    assembly = os.path.basename(fasta.path)

                    if fasta.scheme == '':
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path,
                                          'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                                    if fasta.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     fasta.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _fasta_local_path,
                                                '>',
                                                _fasta_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                # again, avoid shadowing the parsed URL in `fasta`
                                with open(
                                        _fasta_local_path.replace(
                                            "2bit", "fa"), 'w') as fasta_out:
                                    for chrom in twobit.keys():
                                        fasta_out.write(">%s\n" % chrom)
                                        fasta_out.write(str(twobit[chrom]) + '\n')
                            reference = Fasta(
                                _fasta_local_path.replace("2bit", "fa"))
                        elif fasta.path.endswith('gz'):
                            reference = Fasta(
                                _fasta_local_path.replace(".gz", ""))
                        else:
                            reference = Fasta(_fasta_local_path)

                    if gtf.scheme == '':
                        database_filename = gtf.path + '.db'
                        if os.path.exists(database_filename):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(database_filename)
                        else:
                            logging.info(
                                "Creating GTF database file. This will take some time..."
                            )
                            try:
                                db = gffutils.create_db(
                                    gtf.path,
                                    database_filename,
                                    disable_infer_genes=not options["infer_features"],
                                    disable_infer_transcripts=not options["infer_features"])
                            except Exception:
                                # fall back to a database in the download directory,
                                # e.g. when the GTF lives in a read-only location
                                tmp_db = os.path.join(
                                    download_dir,
                                    os.path.basename(gtf.path) + '.db')
                                logging.info(
                                    "Unable to create %s, so using %s",
                                    database_filename, tmp_db)
                                if os.path.exists(tmp_db):
                                    logging.info("Loading existing GTF database file.")
                                    db = gffutils.FeatureDB(tmp_db)
                                else:
                                    db = gffutils.create_db(
                                        gtf.path,
                                        tmp_db,
                                        disable_infer_genes=not options["infer_features"],
                                        disable_infer_transcripts=not options["infer_features"])
                    elif gtf.scheme.lower() in ('ftp', 'http', 'https'):
                        _gtf_local_path = os.path.join(download_dir,
                                                       os.path.basename(
                                                           gtf.path))
                        logging.info("Downloading %s", gtf.geturl())
                        if not os.path.exists(_gtf_local_path):
                            with urlopen(gtf.geturl()) as _gtf:
                                with open(_gtf_local_path, 'wb') as _gtf_local:
                                    copyfileobj(_gtf, _gtf_local)
                                    _gtf_local.flush()
                                    if gtf.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     gtf.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _gtf_local_path,
                                                '>',
                                                _gtf_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                                        logging.info(
                                            "Creating GTF database file. This will take some time..."
                                        )
                                        db = gffutils.create_db(
                                            _gtf_local_path.replace(".gz", ""),
                                            _gtf_local_path.replace(".gz", "") + '.db',
                                            disable_infer_genes=not options["infer_features"],
                                            disable_infer_transcripts=not options["infer_features"])
                                    else:
                                        logging.info(
                                            "Creating GTF database file. This will take some time..."
                                        )
                                        db = gffutils.create_db(
                                            _gtf_local_path,
                                            _gtf_local_path + '.db',
                                            disable_infer_genes=not options["infer_features"],
                                            disable_infer_transcripts=not options["infer_features"])
                        elif gtf.path.endswith('gz'):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(
                                _gtf_local_path.replace(".gz", "") + '.db')
                        else:
                            logging.info("Loading existing GTF database file.")
                            # the database created above is written alongside
                            # the GTF with a '.db' suffix
                            db = gffutils.FeatureDB(_gtf_local_path + '.db')

                    # https://github.com/daler/gffutils/issues/56
                    db.execute('ANALYZE features')
                    #if db.count_features_of_type('intron') == 0 and options["unprocessed_transcripts"]:
                        #logging.info("Inferring intronic sequences...")
                        #db.update(db.create_introns())
                    # lowercase bases mark soft-masked (repeat) sequence
                    soft_chars = set(('a', 'c', 'g', 't'))

                    # options defaults to True, so "-k" only holds a number
                    # when the config sets one explicitly
                    if options["-k"] is True:
                        k = 31
                    else:
                        k = options["-k"]

                    gene_tx_file = os.path.join(
                        index_dir_path,
                        assembly + "_transcripts_to_genes.txt")
                    gene_annotation = os.path.join(
                        index_dir_path,
                        assembly + "_gene_annotation.txt")
                        
                    def features_to_string(features, fasta_in, masked=True, strand=True):
                        """Concatenate the sequences of `features` from
                        `fasta_in`, returning a tuple of (sequence, fraction
                        of soft-masked bases). With masked=True, soft-masked
                        (lowercase) bases are hard-masked to N."""
                        sequences = []
                        feature_strand = "."
                        for feature in features:
                            feature_strand = feature.strand
                            sequences.append(
                                feature.sequence(
                                    fasta_in, use_strand=strand))
                        # if the transcript is on the reverse strand, reverse order of exons 
                        # before concatenating
                        if feature_strand == "-":
                            sequences = sequences[::-1]
                        seq = ''.join(sequences)
                        mask_count = sum(seq.count(a) for a in soft_chars)
                        if masked:
                            if mask_count > 0:
                                seq = seq.replace(
                                    'a', 'N').replace('t', 'N').replace(
                                        'c', 'N').replace('g', 'N')
                        try:
                            frac_masked = mask_count / len(seq)
                        except ZeroDivisionError:
                            frac_masked = 0
                        return (seq, frac_masked)

                    with open(gene_tx_file, 'w') as gene2tx, open(
                            gene_annotation, 'w') as annotation:
                        logging.info("Making transcripts_to_genes, annotation and FASTA file for %s",
                                     gtf.path)
                        with tqdm(
                                total=db.count_features_of_type('gene'),
                                unit='gene') as pbar:
                            for gene in db.features_of_type('gene'):
                                first_exon = next(
                                    db.children(
                                        gene,
                                        featuretype='exon',
                                        order_by='start'))
                                try:
                                    if options["gene_type"] == True:
                                        type_tag = "gene_type"
                                    else:
                                        type_tag = options["gene_type"]
                                    gene_type = first_exon[type_tag][0]
                                except KeyError:
                                    logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                    gene_type = 'NA'
                                try:
                                    if options["gene_name"] == True:
                                        name_tag = "gene_name"
                                    else:
                                        name_tag = options["gene_name"]
                                    gene_name = first_exon[name_tag][0]
                                except KeyError:
                                    logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                    gene_name = 'NA'
                                    
                                transcripts = db.children(gene, featuretype='transcript', order_by='start')
                                for transcript in transcripts:
                                    # Write entry in the transcripts to genes table
                                    gene2tx.write("{txp}\t{gene}\n".format(
                                        gene=gene['gene_id'][0],
                                        txp=transcript['transcript_id'][0]))
                                    # Construct the transcript sequences and write them to the FASTA
                                    fa_seq, frac_masked = features_to_string(
                                        db.children(transcript,
                                                    featuretype='exon',
                                                    order_by='start'),
                                        reference,
                                        masked=options["masked"])
                                    transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
                                    transcripts_fasta.write(fa_seq + '\n')
                                    
                                exons = db.children(gene, featuretype='exon', order_by='start')
                                # materialize the merged exons: db.merge() returns
                                # a generator, and they are iterated again below
                                # for the gene-length sum
                                merged_exons = list(
                                    db.merge(exons,
                                             merge_criteria=(mc.seqid,
                                                             mc.feature_type,
                                                             mc.overlap_any_inclusive)))
                                if options["unprocessed_transcripts"]:
                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
                                    transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
                                    fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
                                    transcripts_fasta.write(fa_seq + '\n')
                                
                                # NOTE: frac_masked is the value computed for
                                # the gene's last transcript above
                                annotation.write(
                                    "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".
                                    format(
                                        gene=gene['gene_id'][0],
                                        type=gene_type,
                                        name=gene_name,
                                        start=gene.start,
                                        stop=gene.stop,
                                        chrom=gene.chrom,
                                        length=sum(len(exon) for exon in merged_exons),
                                        frac_masked=str(frac_masked)))
                                        
                                pbar.update(1)
                            
                    if options["intergenes"]:
                        for seqid in reference.keys():
                            logging.info("Merging overlapping genes on %s", seqid)
                            merged_genes = db.merge(db.region(seqid=seqid), merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
                            with tqdm(unit='intergene features') as pbar:
                                for intergene in db.interfeatures(merged_genes, new_featuretype='intergene'):
                                    transcripts_fasta.write('>' + 'intergene_' + seqid + "_" + str(intergene.start) + ':' + str(intergene.end) + '\n')
                                    fa_seq, _ = features_to_string([intergene], reference, masked=options["masked"], strand=False)
                                    transcripts_fasta.write(fa_seq + '\n')
                                    pbar.update(1)


            # This needs to happen outside of the `with` block so the FASTA
            # file is flushed and closed before salmon reads it
            logging.info("Making salmon index files for %s",
                         species + '/' + index_name)
            cmd = [
                os.path.join(find_data_directory(), 'redist', 'salmon',
                             'bin', 'salmon'), 'index', '-p',
                str(args.threads), '-k',
                str(k), '-t', transcripts_fasta.name, '-i',
                os.path.join(index_dir_path, "salmon")
            ]
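            # equivalent shell invocation:
            #   salmon index -p <threads> -k <k> -t transcripts.fa -i <index_dir>/salmon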
            logging.debug(' '.join(cmd))
            p = Popen(cmd, stderr=PIPE)
            # stream salmon's progress output into the log as it arrives
            for line in p.stderr:
                logging.info(line.decode().rstrip())
            p.wait()
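
The URI-handling comment inside build_index points at fsspec (https://github.com/intake/filesystem_spec). As a rough illustration, here is a minimal sketch of that alternative, assuming fsspec is installed (http/https URLs additionally need aiohttp); append_fasta is a hypothetical helper, not part of PISCES.

import shutil
import fsspec

def append_fasta(fasta_loc, out_handle):
    # fsspec dispatches on the URL scheme (bare path, ftp://, http://, ...)
    # and compression="infer" transparently decompresses *.gz inputs, which
    # would replace the download/gzip branching above for the common cases.
    with fsspec.open(fasta_loc, mode="rt", compression="infer") as src:
        shutil.copyfileobj(src, out_handle)

# usage:
#   with open(transcripts_fasta_file, 'w') as out:
#       append_fasta("https://example.org/extra.fa.gz", out)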