def finalize():
    """Load the error-free sequences from disk into ``GC.final_sequences``.

    Reads the FASTA file named by ``GC.errorfree_sequence_file`` (gzip
    compressed when the name ends in ``.gz``, case-insensitively), parses it
    with ``GC.parseFASTA`` and groups the records by node and time.

    Each FASTA ID is split on ``'|'`` into exactly three fields
    ``label|node|time``; ``time`` is converted to ``float``. Records whose
    label is ``'DUMMY'`` are skipped. The result is stored as::

        GC.final_sequences[node][time] = [(label, seq), ...]

    ``GC.final_sequences`` is created if it does not exist yet; an existing
    mapping is extended in place. Returns ``None``; returns early when the
    file contains no non-blank lines.

    Raises:
        ValueError: if an ID does not split into three ``'|'`` fields or the
            time field is not a valid float.
    """
    # GC.final_sequences[cn_node][t] = list of (label, seq) tuples
    if not hasattr(GC, 'final_sequences'):
        GC.final_sequences = {}

    path = GC.errorfree_sequence_file
    # Use context managers so the file handle is released as soon as the
    # lines have been read, instead of whenever the GC collects it.
    if path.lower().endswith('.gz'):
        from gzip import open as gopen
        with gopen(path) as handle:
            lines = [raw.decode().strip() for raw in handle]
    else:
        with open(path) as handle:
            lines = [raw.strip() for raw in handle]

    # Drop blank lines; an empty file simply leaves the mapping untouched.
    lines = [line for line in lines if line]
    if not lines:
        return

    for ID, seq in GC.parseFASTA(lines).items():
        label, node, time = ID.split('|')
        # Convert before the DUMMY check so malformed times are always reported.
        time = float(time)
        if label == 'DUMMY':
            continue
        GC.final_sequences.setdefault(node, {}).setdefault(time, []).append((label, seq))
def get_edge_list():
    """Run the PANGEA.HIV.sim R pipeline and collect its outputs.

    Steps, in order:

    1. Build the R keyword arguments from the ``GC`` attributes named in
       ``PANGEA_ARGS`` (the R argument name is the second ``'_'``-separated
       token of each attribute name). String values are stripped and quoted;
       empty strings are omitted, except that an empty ``seed`` is replaced
       by a random integer in ``[0, 32767]``. Non-string values are written
       unquoted.
    2. Inside ``PANGEA_path`` (created if needed), write and run
       ``PANGEA_command_script`` with ``GC.Rscript_path``. When
       ``GC.random_number_seed`` is not ``None`` it is passed as ``seed`` and
       then incremented.
    3. Prepend a bash shebang to the first ``*.sh`` script found and execute
       it, sending stderr to ``<script>_output.log``.
    4. Extract the ``*_SIMULATED_INTERNAL.R`` member from the
       ``*_INTERNAL.zip`` archive, dump the transmission table to
       ``PANGEA_trans_file`` via a second R script, and store its
       whitespace-split rows in ``GC.PANGEA_TRANSMISSION_NETWORK``.
    5. Back in the original working directory, write every sequence from the
       ``.fa`` members of ``*_SIMULATED_SEQ.zip`` to
       ``error_free_files/sequence_data.fasta.gz`` and every ``.newick``
       member of ``*_SIMULATED_TREE.zip`` to
       ``error_free_files/phylogenetic_trees/<name>.gz``.

    Returns:
        list: always an empty list.

    Raises:
        AssertionError: if Rscript is missing or an expected PANGEA output
            (shell script or archive) was not produced.
        subprocess.CalledProcessError: if Rscript or the generated shell
            script exits with a non-zero status.
    """
    # --- 1. translate GC settings into R keyword arguments -----------------
    args = []
    for arg in PANGEA_ARGS:
        name = arg.split('_')[1]
        val = getattr(GC, arg)
        if isinstance(val, str):
            val = val.strip()
            if len(val) == 0 and name.strip() == 'seed':
                # if no seed given, randomly generate one
                from random import randint
                val = str(randint(0, 32767))
            if len(val) != 0:
                args.append("%s='%s'" % (name, val))
        else:
            args.append("%s=%s" % (name, val))

    # --- 2. write and run the PANGEA driver script -------------------------
    orig_dir = getcwd()
    makedirs(PANGEA_path, exist_ok=True)
    chdir(PANGEA_path)
    with open(PANGEA_command_script, 'w') as f:
        f.write("library(PANGEA.HIV.sim)\n")
        f.write("outdir <- getwd()\n")
        f.write("pipeline.args <- sim.regional.args(")
        if GC.random_number_seed is not None:
            f.write("seed=%d," % GC.random_number_seed)
            # Advance the seed so later consumers do not reuse this value.
            GC.random_number_seed += 1
        f.write(', '.join(args))
        f.write(")\ncat(sim.regional(outdir, pipeline.args=pipeline.args))")
    try:
        with open(devnull, 'w') as null:
            check_output([GC.Rscript_path, PANGEA_command_script], stderr=null)
    except FileNotFoundError:
        chdir(GC.START_DIR)
        assert False, "Rscript executable was not found"

    # --- 3. run the shell script generated by PANGEA -----------------------
    # Without this guard a missing script surfaced as an unrelated NameError.
    script = next(iter(glob('*.sh')), None)
    assert script is not None, "PANGEA failed to run successfully"
    with open(script, 'r') as f:
        script_str = f.read()
    with open(script, 'w') as f:
        f.write('#!/usr/bin/env bash\n%s' % script_str)
    with open('%s_output.log' % script, 'w') as log:
        check_output(['./%s' % script], stderr=log)

    # --- 4. extract the transmission network -------------------------------
    archive = next(iter(glob('*_INTERNAL.zip')), None)
    assert archive is not None, "PANGEA failed to run successfully"
    with ZipFile(archive, 'r') as z:
        internal = [
            item for item in z.namelist()
            if item.endswith('_SIMULATED_INTERNAL.R')
        ][0]
        with open(internal, 'wb') as f:
            f.write(z.read(internal))
    with open(PANGEA_trans_net_script, 'w') as f:
        f.write("library(PANGEA.HIV.sim)\n")
        f.write("load('%s')\n" % internal)
        f.write("trans <- df.trms[,c('IDTR','IDREC','TIME_TR')]\n")
        f.write(
            "write.table(trans[order(trans$TIME_TR),], file='%s', append=FALSE, sep='\\t', row.names=FALSE, col.names=FALSE, quote=FALSE)"
            % PANGEA_trans_file)
    with open(devnull, 'w') as null:
        check_output([GC.Rscript_path, PANGEA_trans_net_script], stderr=null)
    with open(PANGEA_trans_file) as f:
        GC.PANGEA_TRANSMISSION_NETWORK = [
            row.strip().split() for row in f if len(row.strip()) > 0
        ]
    chdir(orig_dir)

    # --- 5a. collect the simulated sequences -------------------------------
    # Look the archive up afresh; previously a miss silently reused the
    # *_INTERNAL.zip name left over from step 4.
    archive = next(iter(glob('%s/*_SIMULATED_SEQ.zip' % PANGEA_path)), None)
    assert archive is not None, "PANGEA failed to run successfully"
    with ZipFile(archive, 'r') as z, \
            gopen("error_free_files/sequence_data.fasta.gz", 'wb', 9) as f:
        fasta_files = [item for item in z.namelist() if item.endswith('.fa')]
        for fasta in fasta_files:
            seqs = GC.parseFASTA(z.read(fasta).decode('ascii').splitlines())
            for seqID in seqs:
                f.write(('>%s\n%s\n' % (seqID, seqs[seqID])).encode())
        f.write(b'\n')

    # --- 5b. collect the simulated trees -----------------------------------
    archive = next(iter(glob('%s/*_SIMULATED_TREE.zip' % PANGEA_path)), None)
    assert archive is not None, "PANGEA failed to run successfully"
    with ZipFile(archive, 'r') as z:
        trees = [item for item in z.namelist() if item.endswith('.newick')]
        for tree in trees:
            with gopen("error_free_files/phylogenetic_trees/%s.gz" % tree, 'wb', 9) as f:
                # ZipFile.read always returns bytes, so no encoding is needed.
                f.write(z.read(tree))
                f.write(b'\n')
    return []