def test_strip_drams(self):
    """Check DRAM masking for two site lists, then run hivtrace with masking on.

    The first part pulls successive (id, sequence) records from
    ``strip_drams`` and checks fixed three-character windows of the
    sequence against the ``'---'`` mask. The second part runs the whole
    pipeline with the 'lewis' list and expects non-empty trace results.
    """
    mask = '---'

    # 'lewis' list: the first two records are masked at the probed windows,
    # the third record is not masked at 687:690.
    lewis_records = strip_drams(self.fn, 'lewis')
    first_window = next(lewis_records)[1][120:123]
    self.assertTrue(first_window == mask)
    second_window = next(lewis_records)[1][672:675]
    self.assertTrue(second_window == mask)
    third_window = next(lewis_records)[1][687:690]
    self.assertFalse(third_window == mask)

    # 'wheeler' list: both probed windows are masked.
    wheeler_records = strip_drams(self.fn, 'wheeler')
    first_window = next(wheeler_records)[1][129:132]
    self.assertTrue(first_window == mask)
    second_window = next(wheeler_records)[1][687:690]
    self.assertTrue(second_window == mask)

    # Run the whole thing and make sure it completed via the status file
    compare_to_lanl = False
    fraction = '0.025'
    results = hivtrace.hivtrace(id, self.fn, self.reference,
                                self.ambiguities, self.distance_threshold,
                                self.min_overlap, compare_to_lanl, fraction,
                                'lewis')
    self.assertTrue(results["trace_results"])
def hivtrace(id, input, reference, ambiguities, threshold, min_overlap,
             compare_to_lanl, fraction, strip_drams_flag=False,
             filter_edges="no", handle_contaminants="remove",
             skip_alignment=False, attributes_file=None):
    """
    Run the HIV-TRACE pipeline on a FASTA file and return the results.

    PHASE 1)  Pad sequence alignment to HXB2 length with bealign
    PHASE 2)  Convert resulting bam file back to FASTA format
    PHASE 2b) Rename any duplicates in FASTA file
    PHASE 3)  Strip DRAMs if requested
    PHASE 3b) Filtering contaminants before TN93 run if requested
    PHASE 4)  TN93 analysis on the supplied FASTA file alone
    PHASE 5)  Run hivclustercsv to return clustering information in JSON format
    PHASE 5b) Attribute annotations to results from (4)
    PHASE 6)  Run tn93 against LANL if user elects to
    PHASE 6b) Concatenate results from pre-run LANL tn93, user tn93, and (5) analyses
    PHASE 6c) Flag any potential HXB2 sequences
    PHASE 7)  Run hivclustercsv to return clustering information in json format

    Every value placed on a command line below is combined with
    ``' '.join(...)`` for logging, so ``reference``, ``ambiguities``,
    ``threshold``, ``min_overlap``, ``fraction`` and (when used)
    ``filter_edges`` must be strings.

    Args:
        id: job identifier forwarded to every ``update_status`` call.
        input: path of the user FASTA file; ``<input>_output.fasta`` and
            ``<input>_user.tn93output.csv`` are written next to it.
        reference: passed to bealign as ``-r`` and, for the 'separately'
            contaminant screen, to tn93 as ``-s``.
        ambiguities: tn93 ``-a`` value.
        threshold: tn93 / hivnetworkcsv ``-t`` value.
        min_overlap: tn93 ``-l`` value.
        compare_to_lanl: when true, also run phases 6 and 7.
        fraction: tn93 ``-g`` value, only used when ``ambiguities`` is
            ``'resolve'``.
        strip_drams_flag: falsy to skip, otherwise forwarded as the second
            argument of ``sd.strip_drams``.
        filter_edges: ``'no'``/falsy to skip, otherwise the hivnetworkcsv
            ``-n`` value.
        handle_contaminants: ``'no'`` (or None), ``'separately'``,
            ``'report'`` or ``'remove'``.
        skip_alignment: treat ``input`` as an existing alignment and skip
            phases 1 and 2; incompatible with ``compare_to_lanl``.
        attributes_file: optional file handed to
            ``annotate_file_attributes``.

    Returns:
        dict with ``'trace_results'`` and, when ``compare_to_lanl`` is set
        and the LANL run produced output, ``'lanl_trace_results'``.

    Raises:
        Exception: incompatible arguments, or a pre-made alignment whose
            sequences differ in length.
        subprocess.CalledProcessError: an external tool exited non-zero.
        ValueError: when ``id_to_attributes`` hands one back.
    """

    results_json = {}

    # Treat "not specified" the same as 'no' on every code path, not only
    # when alignment is performed.
    if handle_contaminants is None:
        handle_contaminants = 'no'

    # Declare reference file
    resource_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'rsrc')

    # These should be defined in the user's environment
    env_dir = os.path.dirname(sys.executable)
    PYTHON = sys.executable

    # Try python's system executable first, then the user's path.
    if os.path.isfile(os.path.join(env_dir, 'bealign')):
        BEALIGN = os.path.join(env_dir, 'bealign')
    else:
        BEALIGN = 'bealign'

    if os.path.isfile(os.path.join(env_dir, 'bam2msa')):
        BAM2MSA = os.path.join(env_dir, 'bam2msa')
    else:
        BAM2MSA = 'bam2msa'

    if os.path.isfile(os.path.join(env_dir, 'hivnetworkcsv')):
        HIVNETWORKCSV = os.path.join(env_dir, 'hivnetworkcsv')
    else:
        HIVNETWORKCSV = 'hivnetworkcsv'

    TN93DIST = 'tn93'

    # This will have to be another parameter
    LANL_FASTA = os.path.join(resource_dir, 'LANL.FASTA')
    LANL_TN93OUTPUT_CSV = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv')
    DEFAULT_DELIMITER = '|'

    # Check if LANL files exists. If not, then check if zip file exists,
    # otherwise throw error
    try:
        if not os.path.isfile(LANL_FASTA):
            lanl_zip = os.path.join(resource_dir, 'LANL.FASTA.gz')
            gunzip_file(lanl_zip, LANL_FASTA)

        if not os.path.isfile(LANL_TN93OUTPUT_CSV):
            lanl_tn93output_zip = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv.gz')
            gunzip_file(lanl_tn93output_zip, LANL_TN93OUTPUT_CSV)
    except Exception:  # pragma: no cover
        # Report, then let the original error propagate unchanged.
        print("Oops, missing a resource file")
        raise

    # Python Parameters
    SCORE_MATRIX = 'HIV_BETWEEN_F'
    OUTPUT_FORMAT = 'csv'
    SEQUENCE_ID_FORMAT = 'plain'

    # Intermediate filenames
    tmp_path = tempfile.mkdtemp(prefix='hivtrace-')
    basename = os.path.basename(input)

    BAM_FN = os.path.join(tmp_path, basename + '_output.bam')
    OUTPUT_FASTA_FN = input + '_output.fasta'
    OUTPUT_TN93_FN = os.path.join(tmp_path, basename + '_user.tn93output.csv')
    OUTPUT_TN93_CONTAM_FN = os.path.join(tmp_path, basename + '_contam.tn93output.csv')
    DEST_TN93_FN = input + '_user.tn93output.csv'
    JSON_TN93_FN = os.path.join(tmp_path, basename + '_user.tn93output.json')
    JSON_TN93_CONTAM_FN = os.path.join(tmp_path, basename + '_contam.tn93output.json')
    OUTPUT_COMBINED_SEQUENCE_FILE = os.path.join(tmp_path, basename + "_combined_user_lanl.fasta")
    OUTPUT_CLUSTER_JSON = os.path.join(tmp_path, basename + '_user.trace.json')
    LANL_OUTPUT_CLUSTER_JSON = os.path.join(tmp_path, basename + '_lanl_user.trace.json')
    OUTPUT_USERTOLANL_TN93_FN = os.path.join(tmp_path, basename + '_usertolanl.tn93output.csv')
    USER_LANL_TN93OUTPUT = os.path.join(tmp_path, basename + '_userlanl.tn93output.csv')
    USER_FILTER_LIST = os.path.join(tmp_path, basename + '_user_filter.csv')
    CONTAMINANT_ID_LIST = os.path.join(tmp_path, basename + '_contaminants.csv')

    # Check for incompatible statements
    if skip_alignment and compare_to_lanl:
        raise Exception("You have passed arguments that are incompatible! You cannot compare to the public database if you elect to submit a pre-made alignment! Please consider the issue before trying again.")

    if skip_alignment:
        # Check for equal length in all sequences
        seqs = fasta_iter(input)
        seq_length = len(next(seqs)[1])

        if any(len(seq[1]) != seq_length for seq in seqs):
            raise Exception("Not all input sequences have the same length!")

        # copy input file to output fasta file
        shutil.copyfile(input, OUTPUT_FASTA_FN)
    else:
        # PHASE 1
        update_status(id, phases.ALIGNING, status.RUNNING)

        bealign_process = [BEALIGN, '-q', '-r', reference, '-m', SCORE_MATRIX,
                           '-R', input, BAM_FN]

        if handle_contaminants != 'no':
            # Lands immediately before '-R'.
            bealign_process.insert(-3, '-K')

        logging.debug(' '.join(bealign_process))
        # Tool chatter on stdout is not needed.
        subprocess.check_call(bealign_process, stdout=subprocess.DEVNULL)
        update_status(id, phases.ALIGNING, status.COMPLETED)

        # PHASE 2
        update_status(id, phases.BAM_FASTA_CONVERSION, status.RUNNING)
        bam_process = [BAM2MSA, BAM_FN, OUTPUT_FASTA_FN]
        logging.debug(' '.join(bam_process))
        subprocess.check_call(bam_process, stdout=subprocess.DEVNULL)
        update_status(id, phases.BAM_FASTA_CONVERSION, status.COMPLETED)

        if handle_contaminants != 'no' and handle_contaminants != 'separately':
            # The id of the first record in the converted alignment is
            # written out as the contaminant id list.
            with open(OUTPUT_FASTA_FN, 'r') as msa:
                reference_name = next(SeqIO.parse(msa, 'fasta')).id
                logging.debug('Reference name set to %s' % reference_name)
                with open(CONTAMINANT_ID_LIST, 'w') as contaminants:
                    print(reference_name, file=contaminants)

    # Ensure unique ids
    # Warn of duplicates by annotating with an attribute
    rename_duplicates(OUTPUT_FASTA_FN, DEFAULT_DELIMITER)
    attribute_map = ('SOURCE', 'SUBTYPE', 'COUNTRY', 'ACCESSION_NUMBER', 'YEAR_OF_SAMPLING')

    # PHASE 3
    # Strip DRAMS
    if strip_drams_flag:
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".spool"
        with open(str(OUTPUT_FASTA_FN_TMP), 'w') as output_file:
            for (seq_id, data) in sd.strip_drams(OUTPUT_FASTA_FN, strip_drams_flag):
                print(">%s\n%s" % (seq_id, data), file=output_file)
        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 3b Filter contaminants
    if handle_contaminants == 'separately':
        update_status(id, phases.FILTER_CONTAMINANTS, status.RUNNING)

        with open(JSON_TN93_CONTAM_FN, 'w') as tn93_contam_fh:
            tn93_contam_process = [
                TN93DIST, '-q', '-o', OUTPUT_TN93_CONTAM_FN, '-t', '0.015',
                '-a', 'resolve', '-l', min_overlap, '-g', '1.0',
                '-s', reference, '-f', OUTPUT_FORMAT, OUTPUT_FASTA_FN
            ]
            logging.debug(' '.join(tn93_contam_process))
            subprocess.check_call(tn93_contam_process, stdout=tn93_contam_fh,
                                  stderr=tn93_contam_fh)
        update_status(id, phases.FILTER_CONTAMINANTS, status.COMPLETED)

        # Process output for contaminants and remove them from the file
        # Store the contaminants for reporting later
        with open(OUTPUT_TN93_CONTAM_FN, 'r') as tn93_contam_fh:
            tn93reader = csv.reader(tn93_contam_fh, delimiter=',', quotechar='|')
            next(tn93reader, None)  # skip the header row
            contams = [row[0] for row in tn93reader]

        # Set for O(1) membership; `contams` stays a list because it is
        # placed in the JSON-bound results below.
        contam_ids = set(contams)
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".contam.tmp"

        # Remove contams from FASTA file. The filter is lazy, so the write
        # must happen while the source file is still open.
        with open(OUTPUT_FASTA_FN, 'r') as msa_fn:
            msa = SeqIO.parse(msa_fn, 'fasta')
            filtered_msa = (record for record in msa if record.id not in contam_ids)
            # Write to new TMP file
            with open(OUTPUT_FASTA_FN_TMP, "w") as output_handle:
                SeqIO.write(filtered_msa, output_handle, "fasta")

        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 4
    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.RUNNING)

    with open(JSON_TN93_FN, 'w') as tn93_fh:
        tn93_process = [
            TN93DIST, '-q', '-o', OUTPUT_TN93_FN, '-t', threshold,
            '-a', ambiguities, '-l', min_overlap,
            '-g', fraction if ambiguities == 'resolve' else '1.0',
            '-f', OUTPUT_FORMAT, OUTPUT_FASTA_FN
        ]
        logging.debug(' '.join(tn93_process))
        subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)

    shutil.copyfile(OUTPUT_TN93_FN, DEST_TN93_FN)
    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # send contents of tn93 to status page

    # id_to_attributes reports a problem by *returning* a ValueError.
    id_dict = id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)
    if isinstance(id_dict, ValueError):
        update_status(id, "Error: " + id_dict.args[0])
        raise id_dict

    # PHASE 5
    update_status(id, phases.INFERRING_NETWORK, status.RUNNING)

    hivnetworkcsv_process = [HIVNETWORKCSV, '-i', OUTPUT_TN93_FN, '-t', threshold,
                             '-f', SEQUENCE_ID_FORMAT, '-j', '-o']

    if filter_edges and filter_edges != 'no':
        hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_FASTA_FN])

    if handle_contaminants in ('report', 'remove'):
        hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    logging.debug(' '.join(hivnetworkcsv_process))

    with open(OUTPUT_CLUSTER_JSON, 'w') as output_cluster_json_fh:
        with subprocess.Popen(hivnetworkcsv_process, stdout=output_cluster_json_fh,
                              stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.INFERRING_NETWORK, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(hivnetworkcsv_process),
                                            complete_stderr)

    update_status(id, phases.INFERRING_NETWORK, status.COMPLETED, complete_stderr)

    # Read and print output_cluster_json
    with open(OUTPUT_CLUSTER_JSON, 'r') as cluster_json_fh:
        results_json["trace_results"] = json.loads(cluster_json_fh.read())

    # Get singletons
    singletons = get_singleton_nodes(results_json['trace_results']['Nodes'], input)
    results_json['trace_results']['Singletons'] = singletons

    # Place singleton count in Network Summary
    results_json['trace_results']['Network Summary']['Singletons'] = len(singletons)

    # Place contaminant nodes in Network Summary
    if handle_contaminants == 'separately':
        results_json['trace_results']['Network Summary']['contaminant_sequences'] = contams

    if attributes_file is not None and attributes_file is not False:
        annotate_file_attributes(results_json['trace_results'], attributes_file, 'ehars_uid')

    if not compare_to_lanl:
        return results_json

    # PHASE 6
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.RUNNING)

    lanl_tn93_process = [TN93DIST, '-q', '-o', OUTPUT_USERTOLANL_TN93_FN, '-t', threshold,
                         '-a', ambiguities, '-f', OUTPUT_FORMAT]
    if ambiguities == 'resolve':
        lanl_tn93_process.extend(['-g', fraction])
    lanl_tn93_process.extend(['-l', min_overlap, '-s', LANL_FASTA, OUTPUT_FASTA_FN])

    logging.debug(' '.join(lanl_tn93_process))
    subprocess.check_call(lanl_tn93_process, stdout=subprocess.DEVNULL)
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # send contents of tn93 to status page

    # PHASE 6b
    # Perform concatenation
    # This is where reference annotation becomes an issue
    concatenate_data(USER_LANL_TN93OUTPUT, LANL_TN93OUTPUT_CSV,
                     OUTPUT_USERTOLANL_TN93_FN, OUTPUT_TN93_FN)

    # Return value is not used here; the call is kept as in the original flow.
    id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)

    # Create a list from TN93 csv for hivnetworkcsv filter
    create_filter_list(OUTPUT_TN93_FN, USER_FILTER_LIST)

    # PHASE 7
    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING)

    # NOTE(review): this run goes through the Python interpreter while
    # PHASE 5 executes HIVNETWORKCSV directly; if HIVNETWORKCSV fell back to
    # the bare name 'hivnetworkcsv', the interpreter will look for it
    # relative to the working directory - confirm this is intended.
    lanl_hivnetworkcsv_process = [PYTHON, HIVNETWORKCSV, '-i', USER_LANL_TN93OUTPUT,
                                  '-t', threshold, '-f', SEQUENCE_ID_FORMAT, '-j',
                                  '-k', USER_FILTER_LIST]

    if filter_edges and filter_edges != 'no':
        # Edge filtering needs the sequences of both data sets in one file.
        with open(OUTPUT_COMBINED_SEQUENCE_FILE, 'w') as combined_fasta:
            for f_path in (LANL_FASTA, OUTPUT_FASTA_FN):
                with open(f_path) as src_file:
                    shutil.copyfileobj(src_file, combined_fasta)
                    print("\n", file=combined_fasta)

        lanl_hivnetworkcsv_process.extend(['-n', filter_edges,
                                           '-s', OUTPUT_COMBINED_SEQUENCE_FILE])

    if handle_contaminants in ('report', 'remove'):
        lanl_hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(lanl_hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    with open(LANL_OUTPUT_CLUSTER_JSON, 'w') as lanl_output_cluster_json_fh:
        with subprocess.Popen(lanl_hivnetworkcsv_process, stdout=lanl_output_cluster_json_fh,
                              stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING,
                              complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(lanl_hivnetworkcsv_process),
                                            complete_stderr)

    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.COMPLETED)

    # Annotate LANL nodes with id
    with open(LANL_OUTPUT_CLUSTER_JSON, 'r') as lanl_json_fh:
        json_info = lanl_json_fh.read()

    if json_info:
        # Only include clusters that are connected to supplied nodes
        # NOTE(review): json_info was read before annotate_lanl runs, so any
        # change annotate_lanl makes to the file is not reflected in the
        # returned results - confirm against annotate_lanl.
        annotate_lanl(LANL_OUTPUT_CLUSTER_JSON, LANL_FASTA)
        lanl_trace_results = json.loads(json_info)
        results_json['lanl_trace_results'] = lanl_trace_results
    else:
        logging.debug('no lanl results!')

    return results_json
def hivtrace(id, input, reference, ambiguities, threshold, min_overlap,
             compare_to_lanl, fraction, strip_drams_flag=False,
             filter_edges="no", handle_contaminants="remove",
             skip_alignment=False):
    """
    Run the HIV-TRACE pipeline on a FASTA file and return the results.

    PHASE 1)  Pad sequence alignment to HXB2 length with bealign
    PHASE 2)  Convert resulting bam file back to FASTA format
    PHASE 2b) Rename any duplicates in FASTA file
    PHASE 3)  Remove HXB2 and NL43 sequences (not performed by this body)
    PHASE 3b) Strip Drams if requested
    PHASE 4)  TN93 analysis on the supplied FASTA file alone
    PHASE 5)  Run hivclustercsv to return clustering information in JSON format
    PHASE 5b) Attribute annotations to results from (4)
    PHASE 6)  Run tn93 against LANL if user elects to
    PHASE 6b) Concatenate results from pre-run LANL tn93, user tn93, and (5) analyses
    PHASE 6c) Flag any potential HXB2 sequences
    PHASE 7)  Run hivclustercsv to return clustering information in json format

    Every value placed on a command line below is combined with
    ``' '.join(...)`` for logging, so ``reference``, ``ambiguities``,
    ``threshold``, ``min_overlap``, ``fraction`` and (when used)
    ``filter_edges`` / ``handle_contaminants`` must be strings.

    Args:
        id: job identifier forwarded to every ``update_status`` call.
        input: path of the user FASTA file; ``<input>_output.fasta`` is
            written next to it.
        reference: passed to bealign as ``-r``.
        ambiguities: tn93 ``-a`` value.
        threshold: tn93 / hivnetworkcsv ``-t`` value.
        min_overlap: tn93 ``-l`` value.
        compare_to_lanl: when true, also run phases 6 and 7.
        fraction: tn93 ``-g`` value, only used when ``ambiguities`` is
            ``'resolve'``.
        strip_drams_flag: falsy to skip, otherwise forwarded as the second
            argument of ``sd.strip_drams``.
        filter_edges: ``'no'``/falsy to skip, otherwise the hivnetworkcsv
            ``-n`` value.
        handle_contaminants: ``'no'`` (or None) to disable, otherwise the
            hivnetworkcsv ``-C`` value.
        skip_alignment: treat ``input`` as an existing alignment and skip
            phases 1 and 2; incompatible with ``compare_to_lanl``.

    Returns:
        dict with ``'trace_results'`` and, when ``compare_to_lanl`` is set
        and the LANL run produced output, ``'lanl_trace_results'``.

    Raises:
        Exception: incompatible arguments, or a pre-made alignment whose
            sequences differ in length.
        subprocess.CalledProcessError: an external tool exited non-zero.
        ValueError: when ``id_to_attributes`` hands one back.
    """

    results_json = {}

    # Treat "not specified" the same as 'no' on every code path; otherwise a
    # None would reach the '-C' argument when alignment is skipped.
    if handle_contaminants is None:
        handle_contaminants = 'no'

    # Declare reference file
    resource_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'rsrc')

    # These should be defined in the user's environment
    env_dir = os.path.dirname(sys.executable)
    PYTHON = sys.executable
    BEALIGN = os.path.join(env_dir, 'bealign')
    BAM2MSA = os.path.join(env_dir, 'bam2msa')
    TN93DIST = 'tn93'
    HIVNETWORKCSV = os.path.join(env_dir, 'hivnetworkcsv')

    # This will have to be another parameter
    LANL_FASTA = os.path.join(resource_dir, 'LANL.FASTA')
    LANL_TN93OUTPUT_CSV = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv')
    DEFAULT_DELIMITER = '|'

    # Check if LANL files exists. If not, then check if zip file exists,
    # otherwise throw error
    try:
        if not os.path.isfile(LANL_FASTA):
            lanl_zip = os.path.join(resource_dir, 'LANL.FASTA.gz')
            gunzip_file(lanl_zip, LANL_FASTA)

        if not os.path.isfile(LANL_TN93OUTPUT_CSV):
            lanl_tn93output_zip = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv.gz')
            gunzip_file(lanl_tn93output_zip, LANL_TN93OUTPUT_CSV)
    except Exception:  # pragma: no cover
        # Report, then let the original error propagate unchanged.
        print("Oops, missing a resource file")
        raise

    # Python Parameters
    SCORE_MATRIX = 'HIV_BETWEEN_F'
    OUTPUT_FORMAT = 'csv'
    SEQUENCE_ID_FORMAT = 'plain'

    # Intermediate filenames
    tmp_path = tempfile.mkdtemp(prefix='hivtrace-')
    basename = os.path.basename(input)

    BAM_FN = os.path.join(tmp_path, basename + '_output.bam')
    OUTPUT_FASTA_FN = input + '_output.fasta'
    OUTPUT_TN93_FN = os.path.join(tmp_path, basename + '_user.tn93output.csv')
    JSON_TN93_FN = os.path.join(tmp_path, basename + '_user.tn93output.json')
    OUTPUT_COMBINED_SEQUENCE_FILE = os.path.join(tmp_path, basename + "_combined_user_lanl.fasta")
    OUTPUT_CLUSTER_JSON = os.path.join(tmp_path, basename + '_user.trace.json')
    LANL_OUTPUT_CLUSTER_JSON = os.path.join(tmp_path, basename + '_lanl_user.trace.json')
    OUTPUT_USERTOLANL_TN93_FN = os.path.join(tmp_path, basename + '_usertolanl.tn93output.csv')
    USER_LANL_TN93OUTPUT = os.path.join(tmp_path, basename + '_userlanl.tn93output.csv')
    USER_FILTER_LIST = os.path.join(tmp_path, basename + '_user_filter.csv')
    CONTAMINANT_ID_LIST = os.path.join(tmp_path, basename + '_contaminants.csv')

    # Check for incompatible statement
    if skip_alignment and compare_to_lanl:
        raise Exception("You have passed arguments that are incompatible! You cannot compare to the public database if you elect to submit a pre-made alignment! Please consider the issue before trying again.")

    if skip_alignment:
        # Check for equal length in all sequences
        seqs = fasta_iter(input)
        seq_length = len(next(seqs)[1])

        if any(len(seq[1]) != seq_length for seq in seqs):
            raise Exception("Not all input sequences have the same length!")

        # copy input file to output fasta file
        shutil.copyfile(input, OUTPUT_FASTA_FN)
    else:
        # PHASE 1
        update_status(id, phases.ALIGNING, status.RUNNING)

        bealign_process = [BEALIGN, '-q', '-r', reference, '-m', SCORE_MATRIX,
                           '-R', input, BAM_FN]

        if handle_contaminants != 'no':
            # Lands immediately before '-R'.
            bealign_process.insert(-3, '-K')

        logging.debug(' '.join(bealign_process))
        # Tool chatter on stdout is not needed.
        subprocess.check_call(bealign_process, stdout=subprocess.DEVNULL)
        update_status(id, phases.ALIGNING, status.COMPLETED)

        # PHASE 2
        update_status(id, phases.BAM_FASTA_CONVERSION, status.RUNNING)
        bam_process = [BAM2MSA, BAM_FN, OUTPUT_FASTA_FN]
        logging.debug(' '.join(bam_process))
        subprocess.check_call(bam_process, stdout=subprocess.DEVNULL)
        update_status(id, phases.BAM_FASTA_CONVERSION, status.COMPLETED)

        if handle_contaminants != 'no':
            # The id of the first record in the converted alignment is
            # written out as the contaminant id list.
            with open(OUTPUT_FASTA_FN, 'r') as msa:
                reference_name = next(SeqIO.parse(msa, 'fasta')).id
                logging.debug('Reference name set to %s' % reference_name)
                with open(CONTAMINANT_ID_LIST, 'w') as contaminants:
                    print(reference_name, file=contaminants)

    # Ensure unique ids
    # Warn of duplicates by annotating with an attribute
    rename_duplicates(OUTPUT_FASTA_FN, DEFAULT_DELIMITER)
    attribute_map = ('SOURCE', 'SUBTYPE', 'COUNTRY', 'ACCESSION_NUMBER', 'YEAR_OF_SAMPLING')

    # PHASE 3b
    # Strip DRAMS
    if strip_drams_flag:
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".spool"
        with open(str(OUTPUT_FASTA_FN_TMP), 'w') as output_file:
            for (seq_id, data) in sd.strip_drams(OUTPUT_FASTA_FN, strip_drams_flag):
                print(">%s\n%s" % (seq_id, data), file=output_file)
        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 4
    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.RUNNING)

    with open(JSON_TN93_FN, 'w') as tn93_fh:
        tn93_process = [
            TN93DIST, '-q', '-o', OUTPUT_TN93_FN, '-t', threshold,
            '-a', ambiguities, '-l', min_overlap,
            '-g', fraction if ambiguities == 'resolve' else '1.0',
            '-f', OUTPUT_FORMAT, OUTPUT_FASTA_FN
        ]
        logging.debug(' '.join(tn93_process))
        subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)

    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # send contents of tn93 to status page

    # id_to_attributes reports a problem by *returning* a ValueError.
    id_dict = id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)
    if isinstance(id_dict, ValueError):
        update_status(id, "Error: " + id_dict.args[0])
        raise id_dict

    # PHASE 5
    update_status(id, phases.INFERRING_NETWORK, status.RUNNING)

    hivnetworkcsv_process = [HIVNETWORKCSV, '-i', OUTPUT_TN93_FN, '-t', threshold,
                             '-f', SEQUENCE_ID_FORMAT, '-j']

    if filter_edges and filter_edges != 'no':
        hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_FASTA_FN])

    if handle_contaminants != 'no':
        hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    logging.debug(' '.join(hivnetworkcsv_process))

    with open(OUTPUT_CLUSTER_JSON, 'w') as output_cluster_json_fh:
        with subprocess.Popen(hivnetworkcsv_process, stdout=output_cluster_json_fh,
                              stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.INFERRING_NETWORK, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(hivnetworkcsv_process),
                                            complete_stderr)

    update_status(id, phases.INFERRING_NETWORK, status.COMPLETED, complete_stderr)

    # Read and print output_cluster_json
    with open(OUTPUT_CLUSTER_JSON, 'r') as cluster_json_fh:
        results_json["trace_results"] = json.loads(cluster_json_fh.read())

    if not compare_to_lanl:
        return results_json

    # PHASE 6
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.RUNNING)

    lanl_tn93_process = [TN93DIST, '-q', '-o', OUTPUT_USERTOLANL_TN93_FN, '-t', threshold,
                         '-a', ambiguities, '-f', OUTPUT_FORMAT]
    if ambiguities == 'resolve':
        lanl_tn93_process.extend(['-g', fraction])
    lanl_tn93_process.extend(['-l', min_overlap, '-s', LANL_FASTA, OUTPUT_FASTA_FN])

    logging.debug(' '.join(lanl_tn93_process))
    subprocess.check_call(lanl_tn93_process, stdout=subprocess.DEVNULL)
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # send contents of tn93 to status page

    # PHASE 6b
    # Perform concatenation
    # This is where reference annotation becomes an issue
    concatenate_data(USER_LANL_TN93OUTPUT, LANL_TN93OUTPUT_CSV,
                     OUTPUT_USERTOLANL_TN93_FN, OUTPUT_TN93_FN)

    # Return value is not used here; the call is kept as in the original flow.
    id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)

    # Create a list from TN93 csv for hivnetworkcsv filter
    create_filter_list(OUTPUT_TN93_FN, USER_FILTER_LIST)

    # PHASE 7
    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING)

    lanl_hivnetworkcsv_process = [PYTHON, HIVNETWORKCSV, '-i', USER_LANL_TN93OUTPUT,
                                  '-t', threshold, '-f', SEQUENCE_ID_FORMAT, '-j',
                                  '-k', USER_FILTER_LIST]

    if filter_edges and filter_edges != 'no':
        # Edge filtering needs the sequences of both data sets in one file.
        with open(OUTPUT_COMBINED_SEQUENCE_FILE, 'w') as combined_fasta:
            for f_path in (LANL_FASTA, OUTPUT_FASTA_FN):
                with open(f_path) as src_file:
                    shutil.copyfileobj(src_file, combined_fasta)
                    print("\n", file=combined_fasta)

        lanl_hivnetworkcsv_process.extend(['-n', filter_edges,
                                           '-s', OUTPUT_COMBINED_SEQUENCE_FILE])

    if handle_contaminants != 'no':
        lanl_hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(lanl_hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    with open(LANL_OUTPUT_CLUSTER_JSON, 'w') as lanl_output_cluster_json_fh:
        with subprocess.Popen(lanl_hivnetworkcsv_process, stdout=lanl_output_cluster_json_fh,
                              stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING,
                              complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(lanl_hivnetworkcsv_process),
                                            complete_stderr)

    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.COMPLETED)

    # Annotate LANL nodes with id
    with open(LANL_OUTPUT_CLUSTER_JSON, 'r') as lanl_json_fh:
        json_info = lanl_json_fh.read()

    if json_info:
        # Only include clusters that are connected to supplied nodes
        # NOTE(review): json_info was read before annotate_lanl runs, so any
        # change annotate_lanl makes to the file is not reflected in the
        # returned results - confirm against annotate_lanl.
        annotate_lanl(LANL_OUTPUT_CLUSTER_JSON, LANL_FASTA)
        lanl_trace_results = json.loads(json_info)
        results_json['lanl_trace_results'] = lanl_trace_results
    else:
        logging.debug('no lanl results!')

    return results_json