Example #1
import configparser
import os
import time

from LabQueue.qp import fakeqp
from LabUtils.addloglevels import sethandlers

# buildByCore (and renameCore, used in the commented-out block below) are
# defined elsewhere in the source module.


def run(SelectedSGBs, configFile):
    config = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation())
    config.read(configFile)
    build_representatives = config['build_representatives']
    print("Initializing")
    basedir = build_representatives['qp_base_dir']
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    if not os.path.exists(build_representatives['output_cores_dir']):
        os.makedirs(build_representatives['output_cores_dir'])
    sethandlers()
    os.chdir(basedir)
    print("Starting")
    print(time.ctime())
    with fakeqp(jobname='build', q=['himem7.q']) as q:
        q.startpermanentrun()
        waiton = []
        chunk_size = int(build_representatives['chunksize'])
        # for chunk in range(0,len(SelectedSGBs), chunk_size):
        #     waiton.append(q.method(renameCore, (SelectedSGBs[chunk:chunk+chunk_size],
        #                                  build_representatives['output_cores_dir'],
        #                                  build_representatives['genomes_dir'])))
        # q.wait(waiton)
        waiton = [
            q.method(buildByCore,
                     (SelectedSGBs, build_representatives['output_fasta'],
                      build_representatives['output_cores_dir']))
        ]
        q.wait(waiton)
    print(time.ctime())
Example #2
import os
from typing import Callable, Dict, Iterable, Optional

import pandas as pd

from LabQueue.qp import qp, fakeqp
from LabUtils.addloglevels import sethandlers

# JobInfo and _run_per_job are defined elsewhere in the source module.


def run(job_iterator: Iterable[JobInfo],
        data_iterator: Callable,
        xy_function: Callable,
        output_dir: str,
        use_fakeqp: bool = False,
        qp_kwargs: Optional[Dict] = None) -> pd.DataFrame:
    """Creates a job for each item in job_iterator and collects the results."""
    sethandlers()
    if qp_kwargs is None:
        qp_kwargs = {}

    qprovider = fakeqp if use_fakeqp else qp

    with qprovider(**qp_kwargs) as q:
        q.startpermanentrun()

        tkttores = []
        for job_info in job_iterator:
            tkttores.append(
                q.method(_run_per_job,
                         (job_info, data_iterator, xy_function, output_dir),
                         _job_name=job_info.name))

        fnames = []
        for r in tkttores:
            fnames.append(q.waitforresult(r))

    result = pd.concat((pd.read_hdf(f) for f in fnames), ignore_index=True)
    for f in fnames:
        os.remove(f)

    return result
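
As a rough illustration of the API shape, here is a hypothetical call to the run() helper above. JobInfo is stood in by a namedtuple exposing only the .name attribute that run() itself touches, and the loader and result callables are pure assumptions, since their actual contract is set by _run_per_job, which is not shown.

from collections import namedtuple

import pandas as pd

JobInfo = namedtuple('JobInfo', ['name', 'payload'])  # stand-in; the real JobInfo is not shown


def my_loader(job_info):
    # stand-in data callable; the real contract is defined by _run_per_job
    return job_info.payload


def my_xy(data):
    # stand-in result callable; assumed to return a per-job DataFrame
    return pd.DataFrame({'value': [data]})


jobs = [JobInfo(name=f'job_{i}', payload=p) for i, p in enumerate(['a', 'b', 'c'])]
result_df = run(jobs, my_loader, my_xy,
                output_dir='/tmp/per_job_out',  # hypothetical path
                use_fakeqp=True,                # run in-process rather than on the cluster
                qp_kwargs={'jobname': 'demo'})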
Example #3
def runOnSGBs(configFile):
    config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
    config.read(configFile)
    run_pipeline = config['run_pipeline']
    if not os.path.exists(run_pipeline['representatives']):
        EatOrKeepSmallRepresentatives.run(configFile)
    SelectedSGBs = getAllSGBs(run_pipeline['representatives'],
                              run_pipeline['genomes_dir'],
                              run_pipeline['all_large_or_new_sgbs'])
    if not os.path.exists(run_pipeline['stage1output']):
        print ("Making representatives fasta", time.ctime())
        buildRepresentatives.run(SelectedSGBs,configFile)
        print ("Bulding Bowtie index", time.ctime())
        build_big_bowtie.run(configFile)
        with open(run_pipeline['stage1output'],'w') as donefile:
            donefile.write('Done\n')
    basedir = run_pipeline['qp_base_dir']
    score_output = run_pipeline['score_output']
    sethandlers()
    os.chdir(basedir)
    print ("Starting")
    with qp(jobname='build', q=['himem7.q']) as q:
        q.startpermanentrun()
        waiton = []
        chunk_size = 50
        for chunk_start in range(0, len(SelectedSGBs), chunk_size):
            chunkSGBs = SelectedSGBs.loc[chunk_start:chunk_start + chunk_size - 1]
            waiton.append(q.method(runChuckOfSGBs, (chunkSGBs, configFile)))
        q.wait(waiton)
        print ("Done running on %s SGBs"%len(waiton))
    print ("Done", time.ctime())
    return
Example #4
import os
import mwas_annot
from LabQueue.qp import qp, fakeqp
from LabUtils.addloglevels import sethandlers

# parameters
body_site = 'Oral'  # TODO: don't forget to update majmin

output_dir = f'/net/mraid08/export/genie/LabData/Analyses/saarsh/PNP3_mwas/PNP3_mwas_{body_site.lower()}_0months_subtraction'
jobs_path = os.path.join(output_dir, 'jobs')
mwas_file_path = os.path.join(output_dir, 'mb_gwas_significant.h5')

# run
os.chdir(jobs_path)
sethandlers()

with qp(jobname=f'annot_{body_site}',
        _delete_csh_withnoerr=True,
        q=['himem7.q'],
        max_r=2,
        _mem_def='5G') as q:
    q.startpermanentrun()
    snps = q.method(mwas_annot.run, (mwas_file_path, output_dir, body_site))
    q.waitforresult(snps)
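
fakeqp is imported above but never used. Judging from the qprovider switch in Example #2, it appears to accept the same keyword arguments as qp while running the job in-process, so a local-debug variant of the same block might look like the following sketch (not part of the original script):

# Hypothetical debug variant: swap qp for fakeqp (already imported) so that
# mwas_annot.run executes in-process instead of being submitted to the cluster.
DEBUG = True
qprovider = fakeqp if DEBUG else qp

with qprovider(jobname=f'annot_{body_site}',
               _delete_csh_withnoerr=True,
               q=['himem7.q'],
               max_r=2,
               _mem_def='5G') as q:
    q.startpermanentrun()
    snps = q.method(mwas_annot.run, (mwas_file_path, output_dir, body_site))
    q.waitforresult(snps)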
Example #5
class P:
    # fragment of a parameter class P, referenced below via P.work_dir and
    # P.__dict__; the remaining parameters are omitted from this excerpt
    min_reads_per_snp = 3

    min_common_positions = 20000
    min_positions_per_sample = 20000

    work_dir = os.path.join(config.analyses_dir, date2_dir())


def data_gen(loaders, subjects_df=None, **kwargs):
    from LabData.DataMergers.MultiDataLoader import MultiDataLoader
    accepts_subjects_df = all([l != 'SubjectLoader' for l in to_list(loaders)])
    return MultiDataLoader(loaders, subjects_df=subjects_df, **kwargs).get_data() if accepts_subjects_df \
        else MultiDataLoader(loaders, **kwargs).get_data()


def gen_pairwise_dists():
    write_members(os.path.join(P.work_dir, 'PARAMS.txt'), P)

    subjects_gen_f = lambda: data_gen(subjects_loaders, **subjects_get_data_args)

    from LabData.DataAnalyses.MBSNPs.MBSNPAnalyses import MBSNPPairwiseDistances
    MBSNPPairwiseDistances(**dict((key, value) for key, value in P.__dict__.items()
                                  if not key.startswith('__')))\
        .run(subjects_gen_f=subjects_gen_f, species_set=species_set)


if __name__ == '__main__':
    sethandlers(file_dir=config.log_dir)
    gen_pairwise_dists()
Example #6
import os
import glob
from LabQueue.qp import qp
from LabUtils.addloglevels import sethandlers
from LabData.DataLoaders.MBSNPLoader import OralMBSNPLoader


def func():
    potential_species = glob.glob('/home/saarsh/Genie/LabData/Data/MBPipeline/Analyses/MBSNP/Oral/MAF/mb_snp_maf_SGB_*_R1_S100.h5')
    potential_species = ['SGB_' + s.split('_')[-3] for s in potential_species]
    done_species = glob.glob('/home/saarsh/Genie/LabData/Data/MBPipeline/Analyses/MBSNP/Oral/MAF/mb_snp_annot_maf_SGB_*_R1_S100.h5')
    done_species = ['SGB_' + s.split('_')[-3] for s in done_species]
    species = list(set(potential_species) - set(done_species))

    ld = OralMBSNPLoader()
    ld._gen_species_set_maf_annot_data(species, min_reads_per_snp=1, min_samples_per_snp_cached=100)
    # TODO: make sure the gene annotation loader is using the OralMBLoader and not the Gut


sethandlers(file_dir='/home/saarsh/Analysis/antibiotics/jobs/')
os.chdir('/home/saarsh/Analysis/antibiotics/jobs/')

with qp(jobname='annot', _delete_csh_withnoerr=True, q=['himem7.q']) as q:
    q.startpermanentrun()
    tkttores = {}
    tkttores[0] = q.method(func)
    for k, v in tkttores.items():
        q.waitforresult(v)
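
Here the entire species list is processed inside a single queue ticket. A finer-grained alternative, mirroring the one-ticket-per-item pattern of the next example, would be one ticket per SGB; the sketch below assumes the species list computed inside func() has been hoisted to module scope and that _gen_species_set_maf_annot_data accepts a one-element list (it is already called with a list above).

# Hypothetical per-SGB variant: one queue ticket per species instead of one bulk call.
def func_one(species_id):
    ld = OralMBSNPLoader()
    ld._gen_species_set_maf_annot_data([species_id], min_reads_per_snp=1,
                                       min_samples_per_snp_cached=100)

with qp(jobname='annot', _delete_csh_withnoerr=True, q=['himem7.q']) as q:
    q.startpermanentrun()
    tkttores = {s: q.method(func_one, (s,)) for s in species}  # `species` assumed in scope
    for k, v in tkttores.items():
        q.waitforresult(v)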
Example #7
jobs_dir = '/net/mraid08/export/jafar/Microbiome/Analyses/saar/NLDcopmJobs'


def func(folder):
    cmd = f'rm -Rf {folder} &'
    print(cmd)
    _shell_command(cmd)

    # files = glob.glob(os.path.join(folder, '*'))
    # len_files = len(files)
    # for i_file, file in enumerate(files):
    #     if not os.path.isdir(file):
    #         print(f'file {i_file + 1}/{len_files}')
    #             _shell_command('gzip -9 ' + file)


# queue
os.chdir(jobs_dir)
sethandlers(file_dir=jobs_dir)

with qp(jobname='NLDcomp', _mem_def='10G', _tryrerun=False) as q:
    q.startpermanentrun()
    tkttores = {}

    for i_folder, folder in enumerate(folders):
        tkttores[i_folder] = q.method(func, [folder])

    for k, v in tkttores.items():
        q.waitforresult(v)
Example #8
    col and 'SNPB' not in col for col in df.columns
]]
if not (df['RawReadLength'] == read_len).all():
    print('all hell broke loose')
    raise ValueError(f'unexpected RawReadLength, expected {read_len}')
df['cnt'] = True
df.to_pickle(os.path.join(base_dir, 'DFOut', 'PostUniteMetadata.df'))
print(f'Running on {df.shape[0]}')

# pipeline
Email = ' [email protected]'
General_params = f' --max_r {MAX_JOBS} --use_general_python '
Modules = ' --module_seq "MID,UZP,URB,SNB" '

os.chdir(os.path.join(base_dir, 'tmp2', 'jobs'))
logf, warnstream = sethandlers(logging.INFO, True, True, file_prefix='mmmbp_')

with config.qp(jobname=run_name,
               max_r=MAX_JOBS,
               q=['himem7.q'],
               _tryrerun=True,
               delay_batch=10) as q:
    q.startpermanentrun()

    MID_params = '--mid_md_path ' + os.path.join(base_dir, 'DFOut', 'PostUniteMetadata.df ') + \
                 '--mid_input_path ' + os.path.join(os.path.dirname(os.path.dirname(post)), 'tmp2', 'UNT', ' ') + \
                 '--mid_ext .fastq.gz ' + \
                 '--mid_check_cont '

    URB_params = f' --urb_num_mapped_to_subsample {urb_num_mapped_to_subsample} ' \
                 f' --urb_min_mapped_to_retain {urb_min_mapped_to_retain} ' \