Example #1
def fyrd_estimate_pvals(psi, ase, n_reps, min_periods=None,
                        n_genes_per_job=100):
    """Estimate p-values for all genes by submitting chunks to the cluster.

    Assumes pd (pandas), tqdm, estimate_all_pvals, and cluster_args are
    available at module level.
    """
    import fyrd
    outs = {}
    jobs = []
    # Submit one fyrd job per chunk of n_genes_per_job genes
    for i in range(0, len(psi), n_genes_per_job):
        jobs.append(fyrd.submit(estimate_all_pvals,
                                (psi.iloc[i:i+n_genes_per_job],
                                 ase, n_reps, min_periods),
                                **cluster_args))

    # Block on each job in submission order and collect its results
    for job in tqdm(jobs):
        res = job.get()
        for ix in res.index:
            outs[ix] = res[ix]

    return pd.Series(outs)
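A minimal usage sketch for the function above. The inputs psi and ase are assumed to be pandas objects indexed by gene, and cluster_args a module-level dict of fyrd submission options; all file names and values here are illustrative, not from the original source.

import pandas as pd

# Hypothetical inputs: per-gene tables read from disk
psi = pd.read_csv('psi_by_gene.tsv', sep='\t', index_col=0)
ase = pd.read_csv('ase_by_gene.tsv', sep='\t', index_col=0)

# Hypothetical fyrd options consumed by fyrd.submit inside the function
cluster_args = {'cores': 4, 'mem': '8GB', 'time': '02:00:00'}

pvals = fyrd_estimate_pvals(psi, ase, n_reps=1000, n_genes_per_job=100)
print(pvals.sort_values().head())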
Example #2
def get_regions_parallel(positions, genome_file, base=0, count=7):
    """Return a dict of regions surrounding each position.

    Will loop through each chromosome and search all positions in that
    chromosome in one batch. One fyrd job is submitted per chromosome, so
    chromosomes are processed in parallel.

    Args:
        positions (dict):  Dictionary of {chrom->positions}
        genome_file (str): Location of a genome fasta file or directory of
                           files. If directory, file names must be
                           <chrom_name>.fa[.gz]. Gzipped OK.
        base (int):        Either 0 or 1, base of positions in your list
        count (int):       Distance + and - the position to extract

    Returns:
        dict: {chrom->{position->sequence}}
    """
    outs = []
    for chrom in positions.keys():
        # Use the per-chromosome fasta if a directory was given, otherwise
        # fall back to the single genome fasta
        if os.path.isdir(genome_file):
            fa_file = get_fasta_file(genome_file, chrom)
        else:
            fa_file = genome_file
        if not os.path.isfile(fa_file):
            raise FileNotFoundError('{} not found.'.format(fa_file))
        # Walltime heuristic: ~2000 positions per minute plus a one hour
        # buffer; _td is datetime.timedelta
        mins = int(len(positions[chrom]) / 2000) + 60
        time = str(_td(minutes=mins))
        outs.append(
            fyrd.submit(
                get_regions,
                ({chrom: positions[chrom]}, fa_file, base, count),
                cores=1,
                mem='6GB',
                time=time,
            ))

    # Block on each job and merge the per-chromosome dictionaries
    final = {}
    for out in outs:
        final.update(out.get())
    return final
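A hedged usage sketch for get_regions_parallel: positions maps chromosome names to coordinate lists, and the genome is given as a directory of per-chromosome fasta files. Paths and coordinates are made up for illustration.

# Hypothetical inputs
positions = {'chr1': [1000, 2500, 9000], 'chr2': [42, 777]}
regions = get_regions_parallel(positions, '/path/to/genome_dir',
                               base=0, count=7)

# With count=7, each sequence should span position-7..position+7 (15 bp)
print(regions['chr1'][1000])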
Example #3
def get_dinucleotides_parallel(positions,
                               genome_file,
                               base=0,
                               return_as='list'):
    """Return all + and - strand dinucleotides around each position.

    Will loop through each chromosome and search all positions in that
    chromosome in one batch. Lookup is parallel per chromosome.

    Args:
        positions (dict):  Dictionary of {chrom->positions}
        genome_file (str): Location of a genome fasta file or directory of
                           files. If directory, file names must be
                           <chrom_name>.fa[.gz]. Gzipped OK. Directory is
                           preferred in parallel mode.
        base (int):        Either 0 or 1, base of positions in your list
        return_as (str):   dict: return a dictionary of:
                           {chrom->{position->{'ref': str, '+': tuple, '-': tuple}}}
                           list: return two lists with no positions.
                           df: return a DataFrame

    Returns:
        (list, list): + strand dinucleotides, - strand dinucleotides.
                      Returns a dict or DataFrame instead if requested
                      through return_as.
    """
    outs = []
    for chrom in positions.keys():
        # Use the per-chromosome fasta if a directory was given, otherwise
        # fall back to the single genome fasta
        if os.path.isdir(genome_file):
            fa_file = get_fasta_file(genome_file, chrom)
        else:
            fa_file = genome_file
        if not os.path.isfile(fa_file):
            raise FileNotFoundError('{} not found.'.format(fa_file))
        # Walltime heuristic: ~2000 positions per minute plus a 45 minute
        # buffer; _td is datetime.timedelta
        mins = int(len(positions[chrom]) / 2000) + 45
        time = str(_td(minutes=mins))
        outs.append(
            fyrd.submit(
                get_dinucleotides,
                ({chrom: positions[chrom]}, fa_file, base, return_as),
                cores=1,
                mem='6GB',
                time=time,
            ))

    # Choose an accumulator that matches the requested return type
    if return_as == 'df':
        final = []
    elif return_as == 'dict':
        final = {}
    else:
        final = ([], [])

    # Wait for all jobs before collecting any results
    fyrd.wait(outs)
    print('Getting results')
    for out in outs:
        res = out.get()
        if return_as == 'df':
            if isinstance(res, dict):
                res = dict_to_df(res, base)
            final.append(res)
        elif return_as == 'dict':
            final.update(res)
        else:
            plus, minus = res
            final[0] += plus
            final[1] += minus

    if return_as == 'df':
        print('Joining dataframe')
        final = pd.concat(final)

    return final
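A usage sketch for get_dinucleotides_parallel showing two of the return modes; the inputs are hypothetical.

positions = {'chr1': [1000, 2500], 'chr2': [42]}

# Default 'list' mode: two lists of dinucleotides, no position info
plus, minus = get_dinucleotides_parallel(positions, '/path/to/genome_dir')

# 'df' mode: one concatenated DataFrame across chromosomes
dinucs = get_dinucleotides_parallel(positions, '/path/to/genome_dir',
                                    base=0, return_as='df')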
Example #4
def run_depict_permutation(sample_1, sample_2, prefix, cores=None, perms=100,
                           run_path=None, depict_path=DEPICT,
                           perm_start=None, **fyrd_args):
    """Run DEPICT repeatedly and return locations of output files.

    This function uses fyrd to submit cluster jobs, jobs will request 2*cores
    to run, and 12G of memory.

    Takes 20 minutes to run 2 permutations on a small cluster.

    Args:
        sample_1 (str):    File name or path to file with rsids for sample 1
        sample_2 (str):    File name or path to file with rsids for sample 2
        prefix (str):      Name for the output directory, input file names will
                           be used to set output files in this directory.
        cores (int):       Number of cores to use *PER PROCESS* for DEPICT,
                           defaults to 1/2 of available cores on the machine,
                           meaning all cores will be used for run (1/2 each).
        perms (int):       Number of permutations.
        run_path (str):    Root directory to run in, defaults to current dir
        depict_path (str): Path to the DEPICT package, default set in file.
        perm_start (int):  Number to start permutations from
        fyrd_args (dict):  Fyrd keyword arguments, not required.

    Outputs:
        <prefix>/<sample_name>.geneprioritization.txt
        <prefix>/<sample_name>.loci.txt
        <prefix>/<sample_name>.tissueenrichment.txt
        <prefix>/<sample_name>.genesetenrichment.txt
        <prefix>/<sample_name>.log

    Returns:
        dict: Output file locations for each permutation job; raises an
              Exception if a job fails.
    """
    if not cores:
        cores = PARAM_NCORES
    check_depict(depict_path)
    run_path = run_path if run_path else _os.path.abspath('.')
    selfpath = _os.path.realpath(__file__)
    if not perm_start:
        counts = []
        for fl in _os.listdir(run_path):
            if fl.startswith('{}_perm_'.format(prefix)):
                c = fl.split('_')[-1]
                if c.isdigit():
                    counts.append(int(c))
        if counts:
            perm_start = max(counts) + 1
        else:
            perm_start = 1
    if not isinstance(perm_start, int):
        perm_start = 1
    print('Starting permutation count at {}'.format(perm_start))
    s1_rsids = []
    s2_rsids = []
    with open(sample_1) as fin:
        s1_rsids += fin.read().strip().split('\n')
    with open(sample_2) as fin:
        s2_rsids += fin.read().strip().split('\n')
    rsids = np.array(s1_rsids + s2_rsids)
    jobs  = {}
    count = perm_start
    print('Running {} permutations'.format(perms))
    imports = ["import permute_depict as depict",
               "from permute_depict import *"]
    ttl = perms
    pbar = pb(total=ttl, unit='perms')
    while perms:
        # Shuffle all rsids and split them back into two samples of the
        # original sizes
        this_perm = np.random.permutation(rsids)
        new_sample_1_data = sorted(this_perm[:len(s1_rsids)])
        new_sample_2_data = sorted(this_perm[len(s1_rsids):])
        assert len(new_sample_1_data) == len(s1_rsids)
        assert len(new_sample_2_data) == len(s2_rsids)
        perm_path = _pth(run_path, 'perm_files')
        if not _os.path.isdir(perm_path):
            _os.mkdir(perm_path)
        new_sample_1 = _pth(
            _os.path.abspath(perm_path),
            _os.path.basename(sample_1) + '_perm_{}.txt'.format(count)
        )
        new_sample_2 = _pth(
            _os.path.abspath(perm_path),
            _os.path.basename(sample_2) + '_perm_{}.txt'.format(count)
        )
        with open(new_sample_1, 'w') as fout:
            fout.write('\n'.join(new_sample_1_data))
        with open(new_sample_2, 'w') as fout:
            fout.write('\n'.join(new_sample_2_data))
        new_prefix = '{}_perm_{}'.format(prefix, count)
        job_path = _pth(run_path, 'jobs')
        if not _os.path.isdir(job_path):
            _os.mkdir(job_path)
        jobs['perm_{}'.format(count)] = (
            _fyrd.submit(
                run_depict,
                kwargs  = dict(sample_1    = new_sample_1,
                               sample_2    = new_sample_2,
                               prefix      = new_prefix,
                               cores       = cores,
                               run_path    = run_path,
                               depict_path = depict_path),
                name    = new_prefix,
                imports = ['import os as _os',
                           'from os.path import join as _pth',
                           'from subprocess import check_call as _call',
                           'import permute_depict as depict',
                           'from permute_depict import *'],
                cores   = cores*2,
                mem     = '12GB',
                scriptpath  = job_path,
                outpath     = job_path,
                runpath     = run_path,
                syspaths    = selfpath,
                **fyrd_args
            )
        )
        perms -= 1
        count += 1
        pbar.update()
    pbar.close()

    # Get output file information
    print('Permutation jobs submitted, waiting for results.')
    outputs = {}
    with pb(total=ttl, unit='results') as pbar:
        # Poll once per second until every job has returned
        while len(outputs) < len(jobs):
            for name, job in jobs.items():
                if name in outputs:
                    continue
                job.update()
                if job.done:
                    outs = job.get()
                    outputs[name] = outs
                    with open(_pth(run_path, name) + '.files.dict', 'wb') as fout:
                        try:
                            _pickle.dump(outs, fout)
                        except TypeError:
                            pass
                    pbar.update()
            _sleep(1)

    print('Permutation jobs completed.')

    return outputs
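A sketch of how run_depict_permutation might be invoked, assuming two rsid list files (one rsid per line) and a writable run directory. File names and the partition argument (forwarded to fyrd.submit via **fyrd_args) are assumptions for illustration.

outputs = run_depict_permutation(
    'sample1_rsids.txt', 'sample2_rsids.txt',
    prefix='my_comparison', cores=4, perms=100,
    run_path='/scratch/depict_perms',
    partition='normal',  # hypothetical, passed through **fyrd_args
)
for name, files in outputs.items():
    print(name, files)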
Example #5
def main():
    """Run core functionality."""

    # Our PID
    us = os.getpid()

    def exit_us(code=1):
        """Exit with code and delete lockfile if PID is us or dead."""
        if os.path.isfile(LOCK_FILE):
            with open(LOCK_FILE) as ffin:
                ppid = int(ffin.read().strip())
            if ppid == us or not check_pid(ppid):
                os.remove(LOCK_FILE)
        sys.exit(code)

    # Run exit_us at every exit (other than SIGKILL)
    atexit.register(exit_us)

    # Check we aren't already running
    if os.path.isfile(LOCK_FILE):
        with open(LOCK_FILE) as fin:
            pid = int(fin.read().strip())
        if check_pid(pid) and pid != us:
            sys.exit(0)
        else:
            os.remove(LOCK_FILE)

    # Lock the script
    with open(LOCK_FILE, 'w') as fout:
        fout.write(str(us))

    # Time handling
    FMT = '%y%m%d-%H:%M:%S'
    NOW = dt.now()

    # Try to load old job data
    if os.path.isfile(DATA_FILE):
        with open(DATA_FILE) as fin:
            last_job = json.load(fin)
    else:
        last_job = None

    # Decide if we want to run again
    # Don't run if less than WAIT_TIME has passed or if old jobs are running
    if last_job:
        # Use total_seconds(); timedelta.seconds wraps every 24 hours
        elapsed = NOW - dt.strptime(last_job['time'], FMT)
        if elapsed.total_seconds() < WAIT_TIME:
            exit_us(0)

    # Only import fyrd when we have to as it can be slow
    sys.path.insert(0, PYTHON_LIB)
    import fyrd

    if last_job:
        queue = fyrd.queue.Queue('self')
        open_jobs = [queue[i] for i in last_job['jobs'] if str(i) in queue.jobs]
        if open_jobs:
            for job in open_jobs:
                if job.state in fyrd.queue.ACTIVE_STATES:
                    exit_us(0)

    # Create a temp dir just in case
    if not os.path.isdir(TEMP):
        os.makedirs(TEMP)

    # Clean up that dir
    fyrd.clean_dir(TEMP, confirm=False)
    os.system('rm {}/reset_perms* 2>/dev/null'.format(TEMP))
    os.system('rm {}/touch_pi_scratch* 2>/dev/null'.format(TEMP))

    # Get scripts
    scripts = [
        os.path.join(SCRPT_PATH, i) for i in ['reset_perms.sh', 'touch_pi_scratch.sh']
    ]

    # Submit them
    sub_jobs = []
    for script in scripts:
        sub_jobs.append(
            fyrd.submit(
                'bash ' + script + ' >/dev/null 2>/dev/null',
                partition='hbfraser,hns,normal', cores=1, mem=4000,
                time='18:00:00', outfile='/dev/null', errfile='/dev/null',
                scriptpath=TEMP, runpath=TEMP, clean_files=True,
                clean_outputs=True, name=script.split('/')[-1].split('.')[0]
            )
        )

    # Convert jobs into ids only
    job_ids = []
    for job in sub_jobs:
        job.update()
        job_ids.append(job.id)

    # Write out data
    job_data = {'time': NOW.strftime(FMT), 'jobs': job_ids}
    with open(DATA_FILE, 'w') as fout:
        json.dump(job_data, fout)

    # Done, force delete file
    os.remove(LOCK_FILE)
    return 0
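The lockfile logic above depends on a check_pid helper that is not shown. A minimal sketch, assuming the usual signal-0 idiom (an illustration, not the original implementation):

import os

def check_pid(pid):
    """Return True if a process with this PID exists (sketch only)."""
    try:
        os.kill(pid, 0)  # signal 0 sends nothing, only checks for errors
    except OSError:
        return False
    return True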