Example #1
0
 def remove_file_node_elevation(self, ):
     """Retire the "<osm>-nid-to-elevation.txt" file that sits beside the OSM file.

     The caller is responsible for holding the lock (per the original note).
     The file is not deleted: it is renamed with an extra ".old" suffix.
     Also refreshes ``self._osm_file`` from ``self.get_osm_file_path()``.
     """
     self._osm_file = self.get_osm_file_path()
     # NOTE(review): str.rstrip(".osm") strips any trailing run of the
     # characters '.', 'o', 's', 'm' -- not the literal ".osm" suffix -- so
     # e.g. "rooms.osm" becomes "r".  Kept as-is because whatever writes this
     # file presumably derives the name the same way; confirm before changing.
     elevation_file = self._osm_file.rstrip(".osm") + "-nid-to-elevation.txt"
     if not os.path.exists(elevation_file):
         return
     retired_name = elevation_file + ".old"
     if iprint >= 2:
         print("remove", elevation_file, retired_name)
     movefile(elevation_file, retired_name)
Example #2
0
def add_file(logger: Logger, filename: str, verbose: str = "info"):
    """
    Attach a file handler to ``logger`` unless it already owns one.

    When the logger has no ``FileHandler`` yet, an existing ``filename`` is
    first renamed to ``filename + "-bak"``, then a new handler writing to
    ``filename`` is added.  The logger level is set from ``verbose``; the
    handler itself is set to ``DEBUG``.  When a ``FileHandler`` is already
    present the call does nothing (the logger level is left untouched).

    :param logger: the logger
    :param filename: name of the logfile
    :type filename: str
    :param verbose: name of a ``logging`` level, case-insensitive
        (looked up as ``getattr(logging, verbose.upper())``)
    :type verbose: str
    """
    if any(isinstance(existing, FileHandler) for existing in logger.handlers):
        return

    # keep the previous log file around as a backup
    if isfile(filename):
        movefile(filename, filename + "-bak")

    file_handler = FileHandler(filename)
    logger.setLevel(getattr(logging, verbose.upper()))
    file_handler.setLevel(logging.DEBUG)
    logger.addHandler(file_handler)
Example #3
0
def _copyIfNewerVisit(args, sourceDirname, names):
    """Directory-walk visitor: mirror one source directory into the destination.

    args          -- sequence [prefixLen, destRoot, move]; prefixLen is the
                     length of the source root, stripped from sourceDirname
                     to get the path relative to destRoot
    sourceDirname -- directory currently being visited
    names         -- entries of sourceDirname; emptied in place when the
                     directory is excluded so the walker does not descend

    Each non-excluded, non-directory entry that newer() reports as newer
    than its destination counterpart is moved (move true) or copied.
    """
    prefixLen, destRoot, move = args[0], args[1], args[2]
    destDirname = destRoot + sourceDirname[prefixLen:]
    dirName = _basename(destDirname)

    if excludeFromCopying.search(dirName) is not None:
        # Don't recurse into subdirectories
        del names[:]
        return

    # Create the corresponding destination dir if necessary
    mkdir(destDirname)

    for name in names:
        source = sourceDirname + '/' + name

        # Skip excluded names; directories get their own visit from the walker
        if excludeFromCopying.search(name) is not None or os.path.isdir(source):
            continue

        dest = destDirname + '/' + name
        if not newer(source, dest):
            continue

        if move:
            print('mv ' + source + ' ' + dest)
            # shutil has no movefile(); move() is the standard-library call
            shutil.move(source, dest)
        else:
            print('cp ' + source + ' ' + dest)
            shutil.copyfile(source, dest)
Example #4
0
 def __init__(self, tag=""):
     """Rotate this host's existing log files and open a fresh log "0".

     tag -- optional string prepended to the host name used in log file names.

     Steps, in order:
       * create ``folder`` if it does not exist;
       * derive ``self.my_tname`` from ``ip2tarekc`` keyed by this host's IP,
         falling back to the part of the IP after its second dot;
       * rename every ``folder + "log*"`` file whose name parses as
         ``<fnamePrefix><my_tname>-<int>-<rest>`` so that <int> grows by one;
       * open ``<fnamePrefix><my_tname>-0-<timestamp>.txt`` for writing and
         record a few header lines through ``self.lg``.
     """
     self.lg_index = 0
     self.my_ip = get_my_ip()
     if not os.path.exists(folder):
         os.makedirs(folder)
     try:
         self.my_tname = tag + ip2tarekc[self.my_ip]
     except Exception:
         # Was a bare ``except:``; narrowed so Ctrl-C / SystemExit propagate.
         # No table entry for this IP: use the text after the second dot.
         self.my_tname = tag + self.my_ip.split(".", 2)[-1]
         if iprint >= 2:
             print(self.my_tname)
     self.fd_list = []
     self.fnames = []
     self.freshness = 0  # latest log always named 0: log-tarekc-`0`-date-txt

     own_prefix = fnamePrefix + self.my_tname + "-"
     for fn in glob.glob(folder + "log*"):  # log/log*
         try:
             st = fn.replace(own_prefix, "").split("-", 1)
             # int() fails for files that do not follow this host's naming
             # scheme; those are deliberately left untouched.
             ind = int(st[0]) + 1
             newfn = own_prefix + "%d-" % ind + st[-1]
             movefile(fn, newfn)
             if iprint >= 2:
                 print(fn, newfn)
         except Exception:
             # Best effort, as before, but no longer hides KeyboardInterrupt
             # or SystemExit the way the bare ``except:`` did.
             continue

     fmain = own_prefix + "0-" + datetime.datetime.now().strftime(
         "%Y-%m-%d-%H-%M-%S") + ".txt"
     if iprint >= 1:
         print("create", fmain)
     # The handle is stored in self.fd_list rather than closed here.
     fd = open(fmain, "w")
     self.fd_list.append(fd)
     self.fnames.append(fmain)
     self.lg(self.my_tname)
     self.lg(self.my_ip)
     self.lg(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     self.lg(time.time())
     self.lg("\n")
 def copyOrMoveFileToDestination(self, move):
     """Place the downloaded file into its "Season N" directory.

     move -- when true, ``self.createLocalCopy(src)`` is called and the file
             is then moved; otherwise the file is copied.

     The destination name is
     ``<tokenizedFileName>.S<season, 2 digits><E<episode, 2 digits>...><ext>``
     where <ext> is the extension of the downloaded file.  The season
     directory is created when missing, and an existing destination file is
     removed before the transfer.
     """
     src = os.path.join(self.downloadedFileDirectory,
                        self.downloadedFileName)
     seasonDir = 'Season ' + str(self.seasonNo)
     destDir = os.path.join(self.destinationSeriesPath, seasonDir)
     if not os.path.exists(destDir):
         os.makedirs(destDir)

     # Build "E01E02..." by appending one zero-padded tag per episode
     pattern = '{}E{:0>2d}'
     epString = ''
     for ep in self.episodeNos:
         epString = pattern.format(epString, ep)

     extension = os.path.splitext(self.downloadedFileName)[1]
     destFileName = '{}.S{:0>2d}{}{}'.format(
         self.tokenizedFileName, self.seasonNo, epString, extension)
     dest = os.path.join(destDir, destFileName)

     # Clear any previous file at the destination before transferring
     if os.path.exists(dest):
         os.remove(dest)

     if not move:
         print('Copying from.....' + src + '.....' + dest)
         copyfile(src, dest)
         print('>>> Copy complete...' + dest)
         return

     self.createLocalCopy(src)
     print('Moving from.....' + src + '.....' + dest)
     movefile(src, dest)
     print('>>> Move complete...' + dest)
Example #6
0
def copyIfNewer(source, dest, move=0):
    """Copy (or move) source to dest when source is newer.

    source -- file or directory to copy from
    dest   -- file or directory to copy to; a trailing slash is removed
    move   -- when true, files are moved instead of copied

    Nothing happens when source equals dest or source does not exist.
    A plain file that newer() reports as newer than dest is transferred
    directly.  Otherwise source is walked and _copyIfNewerVisit is called
    for each directory with [len(source), dest, move] as its argument.
    """
    if source == dest:
        # Copying in place
        return

    dest = removeTrailingSlash(dest)

    if not os.path.exists(source):
        # Source does not exist
        return

    if not os.path.isdir(source) and newer(source, dest):
        if move:
            print('mv ' + source + ' ' + dest)
            # shutil has no movefile(); move() is the standard-library call
            shutil.move(source, dest)
        else:
            print('cp ' + source + ' ' + dest)
            shutil.copyfile(source, dest)
        return

    # os.path.walk no longer exists on Python 3.  os.walk reproduces the
    # same visitor protocol: the visitor sees every entry of the directory
    # and may prune the walk by deleting entries from ``names`` in place.
    visitArgs = [len(source), dest, move]
    for dirPath, dirNames, fileNames in os.walk(source):
        names = dirNames + fileNames
        _copyIfNewerVisit(visitArgs, dirPath, names)
        # Only descend into the directories the visitor left in ``names``
        dirNames[:] = [d for d in dirNames if d in names]
Example #7
0
def _copyIfNewerVisit(args, sourceDirname, names):
    """Directory-walk visitor: mirror one source directory into the destination.

    args          -- sequence [prefixLen, destRoot, move]; prefixLen is the
                     length of the source root, stripped from sourceDirname
                     to get the path relative to destRoot
    sourceDirname -- directory currently being visited
    names         -- entries of sourceDirname; emptied in place when the
                     directory is excluded so the walker does not descend

    Each non-excluded, non-directory entry that newer() reports as newer
    than its destination counterpart is moved (move true) or copied.
    """
    prefixLen, destRoot, move = args[0], args[1], args[2]
    destDirname = destRoot + sourceDirname[prefixLen:]
    dirName = _basename(destDirname)

    if excludeFromCopying.search(dirName) is not None:
        # Don't recurse into subdirectories
        del names[:]
        return

    # Create the corresponding destination dir if necessary
    mkdir(destDirname)

    for name in names:
        source = sourceDirname + '/' + name

        # Skip excluded names; directories get their own visit from the walker
        if excludeFromCopying.search(name) is not None or os.path.isdir(source):
            continue

        dest = destDirname + '/' + name
        if not newer(source, dest):
            continue

        if move:
            print('mv ' + source + ' ' + dest)
            # shutil has no movefile(); move() is the standard-library call
            shutil.move(source, dest)
        else:
            print('cp ' + source + ' ' + dest)
            shutil.copyfile(source, dest)
Example #8
0
def copyIfNewer(source, dest, move = 0):
    """Copy (or move) source to dest when source is newer.

    source -- file or directory to copy from
    dest   -- file or directory to copy to; a trailing slash is removed
    move   -- when true, files are moved instead of copied

    Nothing happens when source equals dest or source does not exist.
    A plain file that newer() reports as newer than dest is transferred
    directly.  Otherwise source is walked and _copyIfNewerVisit is called
    for each directory with [len(source), dest, move] as its argument.
    """
    if source == dest:
        # Copying in place
        return

    dest = removeTrailingSlash(dest)

    if not os.path.exists(source):
        # Source does not exist
        return

    if not os.path.isdir(source) and newer(source, dest):
        if move:
            print('mv ' + source + ' ' + dest)
            # shutil has no movefile(); move() is the standard-library call
            shutil.move(source, dest)
        else:
            print('cp ' + source + ' ' + dest)
            shutil.copyfile(source, dest)
        return

    # os.path.walk no longer exists on Python 3.  os.walk reproduces the
    # same visitor protocol: the visitor sees every entry of the directory
    # and may prune the walk by deleting entries from ``names`` in place.
    visitArgs = [len(source), dest, move]
    for dirPath, dirNames, fileNames in os.walk(source):
        names = dirNames + fileNames
        _copyIfNewerVisit(visitArgs, dirPath, names)
        # Only descend into the directories the visitor left in ``names``
        dirNames[:] = [d for d in dirNames if d in names]
Example #9
0
 def remove_file_way_speed(self, ):
     """Retire the way-speed file and delete its cache and completion marker.

     The caller is responsible for holding the lock (per the original note).
     The "<osm>-nids-to-speed.txt" file beside the OSM file is renamed with
     an extra ".old" suffix; the per-address cache file and the
     "COMPLETE-way-speed" marker under ``self.data_dir`` are deleted.
     Also refreshes ``self._osm_file`` from ``self.get_osm_file_path()``.
     """
     self._osm_file = self.get_osm_file_path()
     # NOTE(review): str.rstrip(".osm") strips any trailing run of the
     # characters '.', 'o', 's', 'm' -- not the literal ".osm" suffix.  Kept
     # as-is because whatever writes this file presumably derives the name
     # the same way; confirm before changing.
     speed_file = self._osm_file.rstrip(".osm") + "-nids-to-speed.txt"
     if os.path.exists(speed_file):
         if iprint >= 2:
             print("remove", speed_file, speed_file + ".old")
         movefile(speed_file, speed_file + ".old")

     disposable = (
         self.data_dir + os.sep + "cache-%s-nids-to-speed.txt" % self._address_no_space,
         self.data_dir + os.sep + "COMPLETE-way-speed",  # defined in 3genOsmCache.py
     )
     for stale_path in disposable:
         if not os.path.exists(stale_path):
             continue
         if iprint >= 2:
             print("remove", stale_path)
         os.remove(stale_path)
Example #10
0
def movefile(src_fs, src_path, dst_fs, dst_path, chunk_size=16384):
    """Move a file from one filesystem to another.

    Uses shutil.move when both filesystems report a system path for the file.
    Otherwise the file is copied a chunk at a time and the source is removed
    once both handles have been closed.

    src_fs -- Source filesystem object
    src_path -- Source path
    dst_fs -- Destination filesystem object
    dst_path -- Destination path
    chunk_size -- Size of chunks to move if a system move is not available (default 16K)

    Exceptions from opening, reading, writing or closing propagate; in that
    case the source file is left in place.
    """
    src_syspath = src_fs.getsyspath(src_path, default="")
    dst_syspath = dst_fs.getsyspath(dst_path, default="")

    # System move if there are two sys paths
    if src_syspath and dst_syspath:
        shutil.move(src_syspath, dst_syspath)
        return

    src = dst = None

    try:
        # Chunk copy
        src = src_fs.open(src_path, 'rb')
        dst = dst_fs.open(dst_path, 'wb')

        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)

    finally:
        # Close both handles even if closing the first one fails
        try:
            if src is not None:
                src.close()
        finally:
            if dst is not None:
                dst.close()

    # Remove by path, and only after a successful copy with the handles
    # closed, so a failed copy never destroys the source.
    src_fs.remove(src_path)
Example #11
0
def movefile(src_fs, src_path, dst_fs, dst_path, chunk_size=16384):
    """Move a file from one filesystem to another.

    Uses shutil.move when both filesystems report a system path for the file.
    Otherwise the file is copied a chunk at a time and the source is removed
    once both handles have been closed.

    src_fs -- Source filesystem object
    src_path -- Source path
    dst_fs -- Destination filesystem object
    dst_path -- Destination path
    chunk_size -- Size of chunks to move if a system move is not available (default 16K)

    Exceptions from opening, reading, writing or closing propagate; in that
    case the source file is left in place.
    """
    src_syspath = src_fs.getsyspath(src_path, default="")
    dst_syspath = dst_fs.getsyspath(dst_path, default="")

    # System move if there are two sys paths
    if src_syspath and dst_syspath:
        shutil.move(src_syspath, dst_syspath)
        return

    src = dst = None

    try:
        # Chunk copy
        src = src_fs.open(src_path, 'rb')
        dst = dst_fs.open(dst_path, 'wb')

        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)

    finally:
        # Close both handles even if closing the first one fails
        try:
            if src is not None:
                src.close()
        finally:
            if dst is not None:
                dst.close()

    # Remove by path, and only after a successful copy with the handles
    # closed, so a failed copy never destroys the source.
    src_fs.remove(src_path)
Example #12
0
File: run.py Project: dpeerlab/seqc
    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Build a ReadArray from an alignment file, then upload or move the file.

        :param str bamfile: filename of .bam file
        :param str index: directory containing index files; "annotations.gtf" is
          appended directly, so it must end with a path separator
        :param str aws_upload_key: key where aws files should be uploaded; when
          falsy the alignment file is moved locally instead
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be valid
        :param max_transcript_length: forwarded to GeneIntervals
        :returns read_array, upload_manager, read_names: the ReadArray, the
          io.ProcessManager running the S3 upload (None when nothing is
          uploaded), and the read names returned by
          ReadArray.from_alignment_file
        """
        log.info("Filtering aligned records and constructing record database.")
        # Construct translator
        gtf_path = index + "annotations.gtf"
        translator = GeneIntervals(gtf_path,
                                   max_transcript_length=max_transcript_length)
        read_array, read_names = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        if not aws_upload_key:
            # No upload requested: keep the alignment next to the other
            # outputs (it is moved, not deleted).
            if os.path.exists(bamfile):
                movefile(bamfile, args.output_prefix + "_Aligned.out.bam")
            return read_array, None, read_names

        log.info("Uploading bam file to S3.")
        upload_bam = "aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam".format(
            fname=bamfile,
            s3link=aws_upload_key,
            prefix=args.output_prefix)
        print(upload_bam)
        upload_manager = io.ProcessManager(upload_bam)
        upload_manager.run_all()
        return read_array, upload_manager, read_names
Example #13
0
File: run.py Project: dpeerlab/seqc
def run(args) -> None:
    """Run SEQC on the files provided in args, given specifications provided on the
    command line

    :param args: parsed argv, produced by seqc.parser(). This function is only called
      when args.subprocess_name is "run".
    """

    # import inside module for pickle functionality
    # top 2 only needed for post-filtering

    import os
    import multiprocessing
    from seqc import log, ec2, platforms, io, version
    from seqc.sequence import fastq
    from seqc.alignment import star
    from seqc.alignment import sam
    from seqc.email_ import email_user
    from seqc.read_array import ReadArray
    from seqc.core import verify, download
    from seqc import filter
    from seqc.sequence.gtf import GeneIntervals
    from seqc.summary.summary import Section, Summary
    import numpy as np
    import scipy.io
    from shutil import copyfile
    from shutil import move as movefile
    from seqc.summary.summary import MiniSummary
    from seqc.stats.mast import run_mast
    import logging
    import pickle
    import pendulum

    # logger = logging.getLogger('weasyprint')
    # logger.handlers = []  # Remove the default stderr handler
    # logger.setLevel(100)
    # logger.addHandler(logging.FileHandler('weasyprint.log'))

    def determine_start_point(arguments) -> (bool, bool, bool):
        """
        Decide which pipeline stages must run, based on the inputs supplied.

        The most processed input wins: a read array skips everything, an
        alignment file skips merging and alignment, a merged fastq skips
        merging only, and otherwise all stages run.

        :param arguments: Namespace object, result of ArgumentParser.parse_args()
        :returns merge, align, process_bamfile: indicates whether merging, alignment, and
          processing bamfiles should be executed.
        """
        if arguments.read_array:
            merge = align = process_bamfile = False
        elif arguments.alignment_file:
            merge, align, process_bamfile = False, False, True
        elif arguments.merged_fastq:
            merge, align, process_bamfile = False, True, True
        else:
            merge = align = process_bamfile = True
        return merge, align, process_bamfile

    def download_input(dir_, arguments):
        """Resolve the inputs named on ``arguments`` to local paths under ``dir_``.

        Each input attribute is passed through ``download.s3_data`` and
        replaced by what that call returns (per the original comments, data
        is downloaded from AWS S3 when the URI is prefixed with s3://).

        :param str dir_: directory to download data to
        :param arguments: namespace object from argparse; modified in place
        :return args: the same namespace object, reflecting local file paths of
          downloaded files
        """
        # download basespace data if necessary
        if arguments.basespace:
            (arguments.barcode_fastq,
             arguments.genomic_fastq) = io.BaseSpace.download(
                 arguments.platform, arguments.basespace, dir_,
                 arguments.basespace_token)

        # input FASTQ files
        genomic_dir = dir_ + "/genomic_fastq/"
        arguments.genomic_fastq = download.s3_data(arguments.genomic_fastq,
                                                   genomic_dir)
        barcode_dir = dir_ + "/barcode_fastq/"
        arguments.barcode_fastq = download.s3_data(arguments.barcode_fastq,
                                                   barcode_dir)

        # merged fastq file, only when one was given
        if arguments.merged_fastq is not None:
            arguments.merged_fastq = download.s3_data(
                [arguments.merged_fastq], dir_ + "/")[0]

        # STAR index: when starting from an alignment file or a read array
        # only annotations.gtf is requested rather than the whole index.
        gtf_only = bool(arguments.alignment_file or arguments.read_array)
        index_link = (arguments.index + "annotations.gtf"
                      if gtf_only else arguments.index)
        index_files = download.s3_data([index_link], dir_ + "/index/")
        # use the first filename in the list to get the index directory;
        # the trailing slash is required because later code concatenates
        # file names directly onto it,
        # e.g. test-data/index/chrStart.txt --> test-data/index/
        arguments.index = os.path.dirname(index_files[0]) + "/"

        # whitelisted barcode files
        arguments.barcode_files = download.s3_data(arguments.barcode_files,
                                                   dir_ + "/barcodes/")

        # optional single-file inputs: the alignment file (*.bam) first,
        # then the read array (*.h5)
        for attr in ("alignment_file", "read_array"):
            location = getattr(arguments, attr)
            if location:
                setattr(arguments, attr,
                        download.s3_data([location], dir_ + "/")[0])

        return arguments

    def merge_fastq_files(
        technology_platform,
        barcode_fastq: [str],
        output_stem: str,
        genomic_fastq: [str],
    ) -> str:
        """annotates genomic fastq with barcode information; merging the two files.

        Both file lists are sorted before merging so that files are paired by
        position in sorted order (see the note in the body).

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed; its ``merge_function``
          is handed to ``fastq.merge_paired``
        :param barcode_fastq: list of str names of fastq files containing barcode
          information
        :param output_stem: str, stem for output files; the merged file is
          requested as ``output_stem + "_merged.fastq"``
        :param genomic_fastq: list of str names of fastq files containing genomic
          information
        :returns str merged_fastq: name of merged fastq file, as returned by
          ``fastq.merge_paired``
        """

        # hack:
        # Due to the non-platform agnostic glob behavior,
        # it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
        # to avoid this problem, we first sort.
        # this is a temporary hacky solution
        barcode_fastq = sorted(barcode_fastq)
        genomic_fastq = sorted(genomic_fastq)

        log.info("Merging genomic reads and barcode annotations.")
        # NOTE(review): zip() stops at the shorter list, so unpaired files are
        # silently missing from this log output.
        for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
            log.info("Merge {} with {}".format(os.path.basename(bar_fq),
                                               os.path.basename(gen_fq)))

        # The original annotation promised ``(str, int)``; only the merged
        # file name is returned, so the annotation now says ``str``.
        return fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + "_merged.fastq",
            genomic=genomic_fastq,
            barcode=barcode_fastq,
        )

    def align_fastq_records(merged_fastq, dir_, star_args, star_index, n_proc,
                            aws_upload_key) -> (str, io.ProcessManager):
        """
        Align fastq records, then gzip (and optionally upload) the merged fastq.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, output directory; alignments go to ``dir_ + "/alignments/"``
        :param star_args: iterable of "key=value" strings with extra keyword
          arguments for STAR, or None
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc was
          initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager)
          the value returned by ``star.align`` and a ProcessManager for the
          upload of the gzipped merged fastq (None when nothing is uploaded)
        :raises ValueError: if an entry of ``star_args`` contains no "="
        """
        log.info("Aligning merged fastq records.")
        alignment_directory = dir_ + "/alignments/"
        os.makedirs(alignment_directory, exist_ok=True)

        # Split on the first "=" only, so a value that itself contains "="
        # no longer breaks dict construction.
        star_kwargs = {}
        if star_args is not None:
            star_kwargs = dict(a.strip().split("=", 1) for a in star_args)

        bamfile = star.align(merged_fastq, star_index, n_proc,
                             alignment_directory, **star_kwargs)

        log.info("Gzipping merged fastq file.")
        # ``pigz`` comes from the enclosing scope; gzip is used when it is falsy.
        if pigz:
            pigz_zip = "pigz --best -f {fname}".format(fname=merged_fastq)
        else:
            pigz_zip = "gzip -f {fname}".format(fname=merged_fastq)
        pigz_proc = io.ProcessManager(pigz_zip)
        pigz_proc.run_all()
        pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
        merged_fastq += ".gz"  # reflect gzipped nature of file

        upload_manager = None
        if aws_upload_key:
            log.info("Uploading gzipped merged fastq file to S3.")
            merge_upload = "aws s3 mv {fname} {s3link}".format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        # Without an upload key the gzipped merged fastq is left on disk.

        return bamfile, upload_manager

    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Build a ReadArray from an alignment file, then upload or move the file.

        :param str bamfile: filename of .bam file
        :param str index: directory containing index files; "annotations.gtf" is
          appended directly, so it must end with a path separator
        :param str aws_upload_key: key where aws files should be uploaded; when
          falsy the alignment file is moved locally instead
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be valid
        :param max_transcript_length: forwarded to GeneIntervals
        :returns read_array, upload_manager, read_names: the ReadArray, the
          io.ProcessManager running the S3 upload (None when nothing is
          uploaded), and the read names returned by
          ReadArray.from_alignment_file
        """
        log.info("Filtering aligned records and constructing record database.")
        # Construct translator
        gtf_path = index + "annotations.gtf"
        translator = GeneIntervals(gtf_path,
                                   max_transcript_length=max_transcript_length)
        read_array, read_names = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        if not aws_upload_key:
            # No upload requested: keep the alignment next to the other
            # outputs (it is moved, not deleted).
            if os.path.exists(bamfile):
                movefile(bamfile, args.output_prefix + "_Aligned.out.bam")
            return read_array, None, read_names

        log.info("Uploading bam file to S3.")
        upload_bam = "aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam".format(
            fname=bamfile,
            s3link=aws_upload_key,
            prefix=args.output_prefix)
        print(upload_bam)
        upload_manager = io.ProcessManager(upload_bam)
        upload_manager.run_all()
        return read_array, upload_manager, read_names

    # ######################## MAIN FUNCTION BEGINS HERE ################################

    log.setup_logger(args.log_name, args.debug)

    with ec2.instance_clean_up(
            email=args.email,
            upload=args.upload_prefix,
            log_name=args.log_name,
            debug=args.debug,
            terminate=args.terminate,
            running_remote=args.remote,
    ):

        start_run_time = pendulum.now()

        log.notify("SEQC=v{}".format(version.__version__))
        log.notify("STAR=v{}".format(star.get_version()))
        log.notify("samtools=v{}".format(sam.get_version()))

        pigz, mutt = verify.executables("pigz", "mutt")
        if mutt:
            log.notify(
                "mutt executable identified, email will be sent when run "
                "terminates. ")
        else:
            log.notify(
                "mutt was not found on this machine; an email will not be sent to "
                "the user upon termination of SEQC run.")

        # turn off lower coverage filter for 10x
        if ((args.platform == "ten_x") or (args.platform == "ten_x_v2")
                or (args.platform == "ten_x_v3")):
            args.filter_low_coverage = False

        if args.platform == "ten_x_v2" or args.platform == "ten_x_v3":
            log.notify("Setting min_poly_t=0 for 10x v2 & v3")
            args.min_poly_t = 0

        max_insert_size = args.max_insert_size
        if args.filter_mode == "scRNA-seq":
            # for scRNA-seq
            if ((args.platform == "ten_x") or (args.platform == "ten_x_v2")
                    or (args.platform == "ten_x_v3")):
                # set max_transcript_length (max_insert_size) = 10000
                max_insert_size = 10000
                log.notify(
                    "Full length transcripts are used for read mapping in 10x data."
                )
        elif args.filter_mode == "snRNA-seq":
            # for snRNA-seq
            # e.g. 2304700 # hg38
            # e.g. 4434881 # mm38
            max_insert_size = args.max_insert_size
        else:
            # all others
            max_insert_size = args.max_insert_size

        log.notify("max_insert_size is set to {}".format(max_insert_size))

        log.args(args)

        # e.g.
        # --output-prefix=test-data/_outs/test
        # output_dir=test-data
        # output_prefix=test
        output_dir, output_prefix = os.path.split(args.output_prefix)
        if not output_dir:
            output_dir = "."
        else:
            os.makedirs(output_dir, exist_ok=True)

        # check if the platform name provided is supported by seqc
        # todo move into verify for run
        platform_name = verify.platform_name(args.platform)
        platform = platforms.AbstractPlatform.factory(
            platform_name)  # returns platform

        n_processes = multiprocessing.cpu_count(
        ) - 1  # get number of processors

        merge, align, process_bamfile = determine_start_point(args)

        args = download_input(output_dir, args)

        if args.platform == "in_drop_v5":
            platform = platform.build_cb2_barcodes(args.barcode_files)
            log.notify("Built cb2 barcode hash for v5 barcodes.")

        if merge:
            if args.min_poly_t is None:  # estimate min_poly_t if it was not provided
                args.min_poly_t = filter.estimate_min_poly_t(
                    args.barcode_fastq, platform)
                log.notify("Estimated min_poly_t={!s}".format(args.min_poly_t))

            args.merged_fastq = merge_fastq_files(platform, args.barcode_fastq,
                                                  args.output_prefix,
                                                  args.genomic_fastq)

        # SEQC was started from input other than fastq files
        if args.min_poly_t is None:
            args.min_poly_t = 0
            log.warn(
                "Warning: SEQC started from step other than unmerged fastq with "
                "empty --min-poly-t parameter. Continuing with --min-poly-t 0."
            )

        if align:
            upload_merged = args.upload_prefix if merge else None
            args.alignment_file, manage_merged = align_fastq_records(
                args.merged_fastq,
                output_dir,
                args.star_args,
                args.index,
                n_processes,
                upload_merged,
            )
        else:
            manage_merged = None

        if process_bamfile:
            # if the starting point was a BAM file (i.e. args.alignment_file=*.bam & align=False)
            # do not upload by setting this to None
            upload_bamfile = args.upload_prefix if align else None

            ra, manage_bamfile, read_names = create_read_array(
                args.alignment_file,
                args.index,
                upload_bamfile,
                args.min_poly_t,
                max_insert_size,
            )
        else:
            manage_bamfile = None
            ra = ReadArray.load(args.read_array)
            # fixme: the old read_array doesn't have read_names
            read_names = None

        # create the first summary section here
        status_filters_section = Section.from_status_filters(
            ra, "initial_filtering.html")
        sections = [status_filters_section]

        # Skip over the corrections if read array is specified by the user
        if not args.read_array:

            # Correct barcodes
            log.info("Correcting barcodes and estimating error rates.")
            error_rate, df_cb_correction = platform.apply_barcode_correction(
                ra, args.barcode_files)
            if df_cb_correction is not None and len(df_cb_correction) > 0:
                df_cb_correction.to_csv(
                    args.output_prefix + "_cb-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Resolve multimapping
            log.info("Resolving ambiguous alignments.")
            mm_results = ra.resolve_ambiguous_alignments()

            # 121319782799149 / 614086965 / pos=49492038 / AAACATAACG
            # 121319782799149 / 512866590 / pos=49490848 / TCAATTAATC (1 hemming dist away from TCAATTAATT)
            # ra.data["rmt"][91490] = 512866590
            # ra.positions[91490] = 49492038

            # correct errors
            log.info("Identifying RMT errors.")
            df_umi_correction = platform.apply_rmt_correction(ra, error_rate)
            if df_umi_correction is not None and len(df_umi_correction) > 0:
                df_umi_correction.to_csv(
                    args.output_prefix + "_umi-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Apply low coverage filter
            if platform.filter_lonely_triplets:
                log.info("Filtering lonely triplet reads")
                ra.filter_low_coverage(alpha=args.low_coverage_alpha)

            log.info("Saving read array.")
            ra.save(args.output_prefix + ".h5")

            # generate a file with read_name, corrected cb, corrected umi
            # read_name already has pre-corrected cb & umi
            # log.info("Saving correction information.")
            # ra.create_readname_cb_umi_mapping(
            #     read_names, args.output_prefix + "_correction.csv.gz"
            # )

            # Summary sections
            # create the sections for the summary object
            sections += [
                Section.from_cell_barcode_correction(
                    ra, "cell_barcode_correction.html"),
                Section.from_rmt_correction(ra, "rmt_correction.html"),
                Section.from_resolve_multiple_alignments(
                    mm_results, "multialignment.html"),
            ]

        # create a dictionary to store output parameters
        mini_summary_d = dict()

        # filter non-cells
        log.info("Creating counts matrix.")
        sp_reads, sp_mols = ra.to_count_matrix(sparse_frame=True,
                                               genes_to_symbols=args.index +
                                               "annotations.gtf")

        # generate 10x compatible count matrix
        log.info("Creating 10x compatible counts matrix.")
        ra.to_10x_count_matrix(genes_to_symbols=args.index + "annotations.gtf")

        # Save sparse matrices
        log.info("Saving sparse matrices")
        scipy.io.mmwrite(args.output_prefix + "_sparse_read_counts.mtx",
                         sp_reads.data)
        scipy.io.mmwrite(args.output_prefix + "_sparse_molecule_counts.mtx",
                         sp_mols.data)
        # Indices
        df = np.array([np.arange(sp_reads.shape[0]), sp_reads.index]).T
        np.savetxt(
            args.output_prefix + "_sparse_counts_barcodes.csv",
            df,
            fmt="%d",
            delimiter=",",
        )
        # Columns
        df = np.array([np.arange(sp_reads.shape[1]), sp_reads.columns]).T
        np.savetxt(args.output_prefix + "_sparse_counts_genes.csv",
                   df,
                   fmt="%s",
                   delimiter=",")

        log.info("Creating filtered counts matrix.")
        cell_filter_figure = args.output_prefix + "_cell_filters.png"

        # By pass low count filter for mars seq
        (
            sp_csv,
            total_molecules,
            molecules_lost,
            cells_lost,
            cell_description,
        ) = filter.create_filtered_dense_count_matrix(
            sp_mols,
            sp_reads,
            mini_summary_d,
            plot=True,
            figname=cell_filter_figure,
            filter_low_count=platform.filter_low_count,
            filter_mitochondrial_rna=args.filter_mitochondrial_rna,
            filter_low_coverage=args.filter_low_coverage,
            filter_low_gene_abundance=args.filter_low_gene_abundance,
        )

        # Output files
        files = [
            cell_filter_figure,
            args.output_prefix + ".h5",
            args.output_prefix + "_sparse_read_counts.mtx",
            args.output_prefix + "_sparse_molecule_counts.mtx",
            args.output_prefix + "_sparse_counts_barcodes.csv",
            args.output_prefix + "_sparse_counts_genes.csv",
            "raw_feature_bc_matrix/matrix.mtx.gz",
            "raw_feature_bc_matrix/barcodes.tsv.gz",
            "raw_feature_bc_matrix/features.tsv.gz",
        ]

        if os.path.exists(args.output_prefix + "_cb-correction.csv.gz"):
            files.append(args.output_prefix + "_cb-correction.csv.gz")
        if os.path.exists(args.output_prefix + "_umi-correction.csv.gz"):
            files.append(args.output_prefix + "_umi-correction.csv.gz")

        # Summary sections
        # create the sections for the summary object
        sections += [
            Section.from_cell_filtering(cell_filter_figure,
                                        "cell_filtering.html"),
            Section.from_run_time(args.log_name, "seqc_log.html"),
        ]

        # get alignment summary
        if os.path.isfile(output_dir + "/alignments/Log.final.out"):
            os.rename(
                output_dir + "/alignments/Log.final.out",
                args.output_prefix + "_alignment_summary.txt",
            )

            # Upload files and summary sections
            files += [args.output_prefix + "_alignment_summary.txt"]
            sections.insert(
                0,
                Section.from_alignment_summary(
                    args.output_prefix + "_alignment_summary.txt",
                    "alignment_summary.html",
                ),
            )

        cell_size_figure = args.output_prefix + "_cell_size_distribution.png"
        index_section = Section.from_final_matrix(sp_csv, cell_size_figure,
                                                  "cell_distribution.html")
        seqc_summary = Summary(args.output_prefix + "_summary", sections,
                               index_section)
        seqc_summary.prepare_archive()
        seqc_summary.import_image(cell_filter_figure)
        seqc_summary.import_image(cell_size_figure)
        seqc_summary.render()

        # create a .tar.gz with `test_summary/*`
        summary_archive = seqc_summary.compress_archive()
        files += [summary_archive]

        # Create a mini summary section
        alignment_summary_file = args.output_prefix + "_alignment_summary.txt"
        seqc_mini_summary = MiniSummary(
            output_dir,
            output_prefix,
            mini_summary_d,
            alignment_summary_file,
            cell_filter_figure,
            cell_size_figure,
        )
        seqc_mini_summary.compute_summary_fields(ra, sp_csv)
        seqc_mini_summary_json, seqc_mini_summary_pdf = seqc_mini_summary.render(
        )
        files += [seqc_mini_summary_json, seqc_mini_summary_pdf]

        # Running MAST for differential analysis
        # file storing the list of differentially expressed genes for each cluster
        de_gene_list_file = run_mast(
            seqc_mini_summary.get_counts_filtered(),
            seqc_mini_summary.get_clustering_result(),
            args.output_prefix,
        )
        files += [de_gene_list_file]

        # adding the cluster column and write down gene-cell count matrix
        dense_csv = args.output_prefix + "_dense.csv"
        sp_csv.insert(loc=0,
                      column="CLUSTER",
                      value=seqc_mini_summary.get_clustering_result())
        sp_csv.to_csv(dense_csv)
        files += [dense_csv]

        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in files:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    item_name = item.split("/")[-1]
                    log.info('Successfully uploaded %s to "%s%s".' %
                             (item, args.upload_prefix, item_name))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)

        if manage_merged:
            manage_merged.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.merged_fastq, args.upload_prefix))
        if manage_bamfile:
            manage_bamfile.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.alignment_file, args.upload_prefix))

        log.info("SEQC run complete.")

        end_run_time = pendulum.now()
        running_time = end_run_time - start_run_time
        log.info("Running Time={}".format(running_time.in_words()))

        # upload logs
        if args.upload_prefix:
            # upload logs (seqc_log.txt, nohup.log)
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, "./nohup.log"]:
                try:
                    # Make a copy of the file with the output prefix
                    copyfile(item, args.output_prefix + "_" + item)
                    print(args.output_prefix + "_" + item)
                    ec2.Retry(retries=5)(io.S3.upload_file)(
                        args.output_prefix + "_" + item, bucket, key)
                    log.info('Successfully uploaded %s to "%s".' %
                             (item, args.upload_prefix))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)
        else:
            # move the log to output directory
            movefile(args.log_name, args.output_prefix + "_" + args.log_name)

        # todo local test does not send this email
        if mutt:
            email_body = (
                '<font face="Courier New, Courier, monospace">'
                "SEQC RUN COMPLETE.\n\n"
                "The run log has been attached to this email and "
                "results are now available in the S3 location you specified: "
                '"%s"\n\n' % args.upload_prefix)
            email_body = email_body.replace("\n",
                                            "<br>").replace("\t", "&emsp;")
            email_user(summary_archive, email_body, args.email)
Example #14
0
def szukaj_w_bazie_i_katalogu(katalog_z, katalog_do):
    """Move podcast files from ``katalog_z`` into ``katalog_do`` when the
    database has a matching row.

    Walks ``katalog_z`` looking for files named ``"DD - <title>.mp3"``. The
    path of each file relative to ``katalog_z`` is split by a regular
    expression into an audition directory, a ``"YYYY - MM"`` directory, the
    day prefix and the title. The audition directory is mapped to a key of the
    module-level ``audycje_link`` (case-insensitive match against element
    ``[1]`` of each entry). If the ``tokfm`` table holds a row matching that
    audition key, the date (``DD.MM.YYYY``), the title (spaces replaced by
    ``-``) and the expected ``podcast_heard`` value, the file is moved to
    ``<katalog_do>/<audition dir>/<YYYY - MM>``; missing target directories
    are created first.

    The expected ``podcast_heard`` value is ``0`` when ``katalog_z`` equals
    the module-level ``katalog_tok_fm_podcasty_result_dir_przesluchane`` and
    ``1`` otherwise.

    :param katalog_z: directory to scan (source of the moves)
    :param katalog_do: root directory that receives the moved files
    :raises SystemExit: when an audition directory name is not found in
        ``audycje_link`` (an error message is printed first)
    :raises AttributeError: when a matching file's relative path does not fit
        the ``<audition>/<YYYY - MM>/<DD - title>`` layout expected by the
        path regular expression
    """
    source_root = str(PurePosixPath(katalog_z)) + str(SEP)
    target_root = str(PurePosixPath(katalog_do)) + str(SEP)

    # Bound as an integer so the comparison behaves like the numeric literal
    # that used to be spliced into the SQL text.
    if katalog_z == katalog_tok_fm_podcasty_result_dir_przesluchane:
        podcast_heard_val = 0
    else:
        podcast_heard_val = 1

    # The dot before "mp3" is escaped so that only a real ".mp3" suffix matches.
    file_pattern = re.compile(r'^[0-3][0-9] - [0-9A-Za-z ]*\.mp3$')
    # Groups: audition dir, "YYYY - MM" dir, "DD - " prefix, title.
    # The bare dots stand for the path separators between the components.
    path_pattern = re.compile(
        "([a-z A-Z0-9.-]*).([0-9 -]*).([0-3][0-9] - *)([0-9a-zA-Z ]*)")

    # Placeholders keep names containing quotes from breaking (or injecting
    # into) the statement.
    query = ("SELECT name_audition, date_podcast, name_podcast FROM tokfm"
             " WHERE name_audition LIKE ?"
             " AND date_podcast LIKE ?"
             " AND name_podcast LIKE ?"
             " AND podcast_heard = ?")

    moved_count = 0

    conn = sqlite3.connect(DATABASE_FILE)
    cur = conn.cursor()
    try:
        # Tidy up the "(un)heard" directory - note that the dependency
        # between the two directories is reversed depending on the caller.
        for root, dirs, files in os.walk(source_root):
            for filename in files:
                if not file_pattern.search(filename):
                    continue

                full_path = os.path.join(root, filename)
                # str.lstrip strips a character set, not a prefix, so the
                # source root is removed with a single replace instead.
                relative_path = full_path.replace(source_root, "", 1)
                parts = path_pattern.search(relative_path).groups()

                # Map the directory name to its key in audycje_link; when
                # several entries match, the last one wins.
                audition_key = ""
                wanted_dir = parts[0].strip().lower()
                for key in audycje_link:
                    if audycje_link[key][1].lower() == wanted_dir:
                        audition_key = key
                if not audition_key:
                    print("Blad w nazwie katalogu:", r'"' + parts[0] + r'"')
                    # Same exception type that exit() raised, without relying
                    # on the interactive-only helper from the site module.
                    raise SystemExit()

                podcast_date = datetime.strptime(
                    parts[1] + "-" + parts[2].rstrip("- "), "%Y - %m-%d")
                date_for_db = podcast_date.strftime("%d.%m.%Y")
                year_month = podcast_date.strftime("%Y - %m")
                name_for_db = parts[3].replace(" ", "-")

                target_audition = target_root + audycje_link[audition_key][1]
                target_month = target_audition + SEP + year_month

                cur.execute(query, (
                    "%" + audition_key + "%",
                    "%" + date_for_db + "%",
                    "%" + name_for_db + "%",
                    podcast_heard_val,
                ))
                if not cur.fetchone():
                    continue

                # Create the target hierarchy top-down, one level at a time.
                for katalog in (target_root, target_audition, target_month):
                    target_dir = Path(PurePosixPath(katalog))
                    if not target_dir.exists():
                        target_dir.mkdir()
                movefile(full_path, target_month)
                print("Przenioslem \"" + filename + "\" do " +
                      target_month + SEP)
                moved_count += 1
    finally:
        # Release the database handles even when the walk aborts early.
        cur.close()
        conn.close()

    print("Ilosc przenosin:", moved_count)