Exemple #1
0
    def test_strip_drams(self):
        """Check DRAM masking for two site lists, then run the pipeline with masking on.

        strip_drams() is consumed as an iterator of records whose element [1]
        is the sequence string; each next() call advances to the following
        record, so consecutive assertions inspect consecutive sequences.
        """

        # 'lewis' list: the first two records must be masked ('---') at the
        # checked codons, the third record must be left untouched at 687:690.
        results = strip_drams(self.fn, 'lewis')
        self.assertEqual(next(results)[1][120:123], '---')
        self.assertEqual(next(results)[1][672:675], '---')
        self.assertNotEqual(next(results)[1][687:690], '---')

        # 'wheeler' list: both checked codons must be masked.
        results = strip_drams(self.fn, 'wheeler')
        self.assertEqual(next(results)[1][129:132], '---')
        self.assertEqual(next(results)[1][687:690], '---')

        # Run the whole pipeline with DRAM stripping enabled.
        # NOTE(review): `id` is not bound in this method; unless the module
        # rebinds it, this passes the builtin `id` function as the job id.
        # Confirm whether a real identifier was intended.
        results = hivtrace.hivtrace(id, self.fn, self.reference,
                                    self.ambiguities, self.distance_threshold,
                                    self.min_overlap, False, '0.025', 'lewis')

        self.assertTrue(results["trace_results"])
Exemple #2
0
  def test_strip_drams(self):
    """Check DRAM masking for two site lists, then run the pipeline with masking on.

    strip_drams() is consumed as an iterator of records whose element [1]
    is the sequence string; each next() call advances to the following
    record, so consecutive assertions inspect consecutive sequences.
    """

    # 'lewis' list: the first two records must be masked ('---') at the
    # checked codons, the third record must be left untouched at 687:690.
    results = strip_drams(self.fn, 'lewis')
    self.assertEqual(next(results)[1][120:123], '---')
    self.assertEqual(next(results)[1][672:675], '---')
    self.assertNotEqual(next(results)[1][687:690], '---')

    # 'wheeler' list: both checked codons must be masked.
    results = strip_drams(self.fn, 'wheeler')
    self.assertEqual(next(results)[1][129:132], '---')
    self.assertEqual(next(results)[1][687:690], '---')

    # Run the whole pipeline with DRAM stripping enabled.
    # NOTE(review): `id` is not bound in this method; unless the module
    # rebinds it, this passes the builtin `id` function as the job id.
    # Confirm whether a real identifier was intended.
    results = hivtrace.hivtrace(id, self.fn, self.reference, self.ambiguities,
                      self.distance_threshold, self.min_overlap,
                      False, '0.025', 'lewis')

    self.assertTrue(results["trace_results"])
Exemple #3
0
def hivtrace(id, input, reference, ambiguities, threshold, min_overlap,
             compare_to_lanl, fraction, strip_drams_flag = False, filter_edges = "no",
             handle_contaminants = "remove", skip_alignment = False, attributes_file = None):

    """
    Run the HIV-TRACE pipeline on a FASTA file and return the results as a dict.

    PHASE 1)  Pad sequence alignment to HXB2 length with bealign
    PHASE 2)  Convert resulting bam file back to FASTA format
    PHASE 2b) Rename any duplicates in FASTA file
    PHASE 3)  Strip DRAMs if requested
    PHASE 3b) Filtering contaminants before TN93 run if requested
    PHASE 4)  TN93 analysis on the supplied FASTA file alone
    PHASE 5)  Run hivclustercsv to return clustering information in JSON format
    PHASE 5b) Attribute annotations to results from (4)
    PHASE 6)  Run tn93 against LANL if user elects to
    PHASE 6b) Concatenate results from pre-run LANL tn93, user tn93, and (5) analyses
    PHASE 6c) Flag any potential HXB2 sequences
    PHASE 7)  Run hivclustercsv to return clustering information in json format

    Values that end up on a command line (reference, ambiguities, threshold,
    min_overlap, fraction, filter_edges) must be strings: the argument lists
    are joined with ' '.join() for logging and handed to subprocess.

    Parameters:
        id                  -- job identifier forwarded to update_status()
        input               -- path of the input FASTA file; '<input>_output.fasta'
                               and '<input>_user.tn93output.csv' are written next to it
        reference           -- passed to bealign ('-r') and, when screening
                               contaminants separately, to tn93 ('-s')
        ambiguities         -- passed to tn93 ('-a'); 'resolve' enables `fraction`
        threshold           -- passed to tn93 and hivnetworkcsv ('-t')
        min_overlap         -- passed to tn93 ('-l')
        compare_to_lanl     -- when true, also run PHASES 6-7 against the LANL files
        fraction            -- passed to tn93 ('-g') when ambiguities == 'resolve'
        strip_drams_flag    -- false to skip PHASE 3; otherwise forwarded to
                               sd.strip_drams() as its second argument
        filter_edges        -- 'no' (or false) to skip; otherwise passed to
                               hivnetworkcsv ('-n')
        handle_contaminants -- 'no' (or None), 'separately', 'report' or 'remove'
        skip_alignment      -- when true, `input` is used as-is (all sequences
                               must have equal length) and PHASES 1-2 are skipped
        attributes_file     -- when given, forwarded to annotate_file_attributes()

    Returns:
        dict with key 'trace_results' and, when compare_to_lanl is true and the
        LANL run produced output, 'lanl_trace_results'.

    Raises:
        Exception                     -- incompatible arguments, or unequal
                                         sequence lengths with skip_alignment
        subprocess.CalledProcessError -- an external tool exited non-zero
        ValueError                    -- when id_to_attributes() returns one
    """

    results_json = {}

    # Treat "not specified" the same as 'no' on every code path, not only when
    # the alignment phase runs.
    if handle_contaminants is None:
        handle_contaminants = 'no'

    # Declare reference file
    resource_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'rsrc')

    # These should be defined in the user's environment
    env_dir = os.path.dirname(sys.executable)
    PYTHON = sys.executable

    def _tool_path(name):
        # Try the directory of python's executable first, then the user's path.
        candidate = os.path.join(env_dir, name)
        return candidate if os.path.isfile(candidate) else name

    BEALIGN = _tool_path('bealign')
    BAM2MSA = _tool_path('bam2msa')
    HIVNETWORKCSV = _tool_path('hivnetworkcsv')
    TN93DIST = 'tn93'

    # This will have to be another parameter
    LANL_FASTA = os.path.join(resource_dir, 'LANL.FASTA')
    LANL_TN93OUTPUT_CSV = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv')
    DEFAULT_DELIMITER = '|'

    # Check if LANL files exists. If not, then check if zip file exists,
    # otherwise throw error
    try:
        if not os.path.isfile(LANL_FASTA):
            lanl_zip = os.path.join(resource_dir, 'LANL.FASTA.gz')
            gunzip_file(lanl_zip, LANL_FASTA)

        if not os.path.isfile(LANL_TN93OUTPUT_CSV):
            lanl_tn93output_zip = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv.gz')
            gunzip_file(lanl_tn93output_zip, LANL_TN93OUTPUT_CSV)
    except Exception: # pragma: no cover
        # `except e:` would itself fail with NameError and hide the real error.
        print("Oops, missing a resource file")
        raise

    # Check for incompatible statements before any temporary files are created
    if skip_alignment and compare_to_lanl:
        raise Exception("You have passed arguments that are incompatible! You cannot compare to the public database if you elect to submit a pre-made alignment! Please consider the issue before trying again.")

    # Python Parameters
    SCORE_MATRIX = 'HIV_BETWEEN_F'
    OUTPUT_FORMAT = 'csv'
    SEQUENCE_ID_FORMAT = 'plain'

    # Intermediate filenames
    tmp_path = tempfile.mkdtemp(prefix='hivtrace-')
    basename = os.path.basename(input)

    BAM_FN                        = os.path.join(tmp_path, basename+'_output.bam')
    OUTPUT_FASTA_FN               = input+'_output.fasta'
    OUTPUT_TN93_FN                = os.path.join(tmp_path, basename+'_user.tn93output.csv')
    OUTPUT_TN93_CONTAM_FN         = os.path.join(tmp_path, basename+'_contam.tn93output.csv')
    DEST_TN93_FN                  = input+'_user.tn93output.csv'
    JSON_TN93_FN                  = os.path.join(tmp_path, basename+'_user.tn93output.json')
    JSON_TN93_CONTAM_FN           = os.path.join(tmp_path, basename+'_contam.tn93output.json')
    OUTPUT_COMBINED_SEQUENCE_FILE = os.path.join(tmp_path, basename+"_combined_user_lanl.fasta")
    OUTPUT_CLUSTER_JSON           = os.path.join(tmp_path, basename+'_user.trace.json')
    LANL_OUTPUT_CLUSTER_JSON      = os.path.join(tmp_path, basename+'_lanl_user.trace.json')
    OUTPUT_USERTOLANL_TN93_FN     = os.path.join(tmp_path, basename+'_usertolanl.tn93output.csv')
    USER_LANL_TN93OUTPUT          = os.path.join(tmp_path, basename+'_userlanl.tn93output.csv')
    USER_FILTER_LIST              = os.path.join(tmp_path, basename+'_user_filter.csv')
    CONTAMINANT_ID_LIST           = os.path.join(tmp_path, basename+'_contaminants.csv')

    # Sink for output we don't care about. Using the subprocess constant
    # instead of an opened os.devnull handle means there is nothing to leak on
    # early returns or exceptions.
    DEVNULL = subprocess.DEVNULL

    if skip_alignment:

        # Check for equal length in all sequences
        seqs = fasta_iter(input)
        seq_length = len(next(seqs)[1])

        if any(len(seq[1]) != seq_length for seq in seqs):
            raise Exception("Not all input sequences have the same length!")

        # copy input file to output fasta file
        shutil.copyfile(input, OUTPUT_FASTA_FN)

    else:
        # PHASE 1
        update_status(id, phases.ALIGNING, status.RUNNING)

        bealign_process = [BEALIGN, '-q', '-r', reference, '-m', SCORE_MATRIX, '-R', input, BAM_FN]

        if handle_contaminants != 'no':
            # Goes in front of the trailing "-R input BAM_FN" arguments.
            bealign_process.insert(-3, '-K')

        logging.debug(' '.join(bealign_process))
        subprocess.check_call(bealign_process, stdout=DEVNULL)
        update_status(id, phases.ALIGNING, status.COMPLETED)

        # PHASE 2
        update_status(id, phases.BAM_FASTA_CONVERSION, status.RUNNING)
        bam_process = [BAM2MSA, BAM_FN, OUTPUT_FASTA_FN]
        logging.debug(' '.join(bam_process))
        subprocess.check_call(bam_process, stdout=DEVNULL)
        update_status(id, phases.BAM_FASTA_CONVERSION, status.COMPLETED)

    if handle_contaminants not in ('no', 'separately'):
        # The id of the first record in the alignment is written out as the
        # contaminant id list later handed to hivnetworkcsv ('-F').
        with open(OUTPUT_FASTA_FN, 'r') as msa:
            reference_name = next(SeqIO.parse(msa, 'fasta')).id
            logging.debug('Reference name set to %s' % reference_name)
            with open(CONTAMINANT_ID_LIST, 'w') as contaminants:
                print(reference_name, file=contaminants)

    # Ensure unique ids
    # Warn of duplicates by annotating with an attribute
    rename_duplicates(OUTPUT_FASTA_FN, DEFAULT_DELIMITER)
    attribute_map = ('SOURCE', 'SUBTYPE', 'COUNTRY', 'ACCESSION_NUMBER', 'YEAR_OF_SAMPLING')

    # PHASE 3
    # Strip DRAMS
    if strip_drams_flag:
        # Write to a spool file first, then replace the alignment with it.
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".spool"
        with open(str(OUTPUT_FASTA_FN_TMP), 'w') as output_file:
            for (seq_id, data) in sd.strip_drams(OUTPUT_FASTA_FN, strip_drams_flag):
                print(">%s\n%s" % (seq_id, data), file=output_file)
        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 3b Filter contaminants
    if handle_contaminants == 'separately':

        update_status(id, phases.FILTER_CONTAMINANTS, status.RUNNING)

        with open(JSON_TN93_CONTAM_FN, 'w') as tn93_contam_fh:
            tn93_contam_process = [TN93DIST,
                                   '-q',
                                   '-o', OUTPUT_TN93_CONTAM_FN,
                                   '-t', '0.015',
                                   '-a', 'resolve',
                                   '-l', min_overlap,
                                   '-g', '1.0',
                                   '-s', reference,
                                   '-f', OUTPUT_FORMAT,
                                   OUTPUT_FASTA_FN]

            logging.debug(' '.join(tn93_contam_process))
            subprocess.check_call(tn93_contam_process, stdout=tn93_contam_fh, stderr=tn93_contam_fh)
            update_status(id, phases.FILTER_CONTAMINANTS, status.COMPLETED)

        # Process output for contaminants and remove them from the file
        # Store the contaminants for reporting later
        with open(OUTPUT_TN93_CONTAM_FN, 'r') as tn93_contam_fh:
            tn93reader = csv.reader(tn93_contam_fh, delimiter=',', quotechar='|')
            next(tn93reader, None)  # skip the header row, if any
            contams = [row[0] for row in tn93reader]

        # Set for O(1) membership tests; `contams` itself stays a list because
        # it is stored in the returned results.
        contam_ids = set(contams)
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".contam.tmp"

        # Remove contams from FASTA file
        with open(OUTPUT_FASTA_FN, 'r') as msa_fn:
            kept_records = (record for record in SeqIO.parse(msa_fn, 'fasta')
                            if record.id not in contam_ids)
            # Write to new TMP file
            with open(OUTPUT_FASTA_FN_TMP, "w") as output_handle:
                SeqIO.write(kept_records, output_handle, "fasta")

        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 4
    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.RUNNING)

    # stdout and stderr of tn93 are captured in JSON_TN93_FN.
    with open(JSON_TN93_FN, 'w') as tn93_fh:
        tn93_process = [TN93DIST, '-q', '-o', OUTPUT_TN93_FN, '-t',
                        threshold, '-a', ambiguities, '-l',
                        min_overlap, '-g', fraction if ambiguities == 'resolve' else '1.0',
                        '-f', OUTPUT_FORMAT, OUTPUT_FASTA_FN]

        logging.debug(' '.join(tn93_process))
        subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)
        shutil.copyfile(OUTPUT_TN93_FN, DEST_TN93_FN)
        update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # id_to_attributes() may hand back a ValueError instance instead of
    # raising it; report it and raise it here.
    id_dict = id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)
    if isinstance(id_dict, ValueError):
        update_status(id, "Error: " + id_dict.args[0])
        raise id_dict

    # PHASE 5
    update_status(id, phases.INFERRING_NETWORK, status.RUNNING)

    hivnetworkcsv_process = [HIVNETWORKCSV, '-i', OUTPUT_TN93_FN, '-t',
                             threshold, '-f', SEQUENCE_ID_FORMAT, '-j', '-o']

    if filter_edges and filter_edges != 'no':
        hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_FASTA_FN])

    if handle_contaminants in ('report', 'remove'):
        hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''

    # The output handle is closed even when the tool fails.
    with open(OUTPUT_CLUSTER_JSON, 'w') as output_cluster_json_fh:
        with subprocess.Popen(hivnetworkcsv_process, stdout=output_cluster_json_fh, stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.INFERRING_NETWORK, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        # Report the real exit status of the process.
        raise subprocess.CalledProcessError(p.returncode, ' '.join(hivnetworkcsv_process), complete_stderr)

    update_status(id, phases.INFERRING_NETWORK, status.COMPLETED, complete_stderr)

    # Read output_cluster_json
    with open(OUTPUT_CLUSTER_JSON, 'r') as cluster_json_fh:
        results_json["trace_results"] = json.load(cluster_json_fh)

    # Get singletons
    singletons = get_singleton_nodes(results_json['trace_results']['Nodes'], input)

    results_json['trace_results']['Singletons'] = singletons

    # Place singleton count in Network Summary
    results_json['trace_results']['Network Summary']['Singletons'] = len(singletons)

    # Place contaminant nodes in Network Summary
    if handle_contaminants == 'separately':
        results_json['trace_results']['Network Summary']['contaminant_sequences'] = contams

    if attributes_file is not None and attributes_file != False:
        annotate_file_attributes(results_json['trace_results'], attributes_file, 'ehars_uid')

    if not compare_to_lanl:
        return results_json

    # PHASE 6
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.RUNNING)

    # '-g fraction' is only passed when ambiguities are being resolved.
    lanl_tn93_process = [TN93DIST, '-q', '-o', OUTPUT_USERTOLANL_TN93_FN, '-t',
                         threshold, '-a', ambiguities,
                         '-f', OUTPUT_FORMAT]
    if ambiguities == 'resolve':
        lanl_tn93_process.extend(['-g', fraction])
    lanl_tn93_process.extend(['-l', min_overlap, '-s', LANL_FASTA, OUTPUT_FASTA_FN])

    logging.debug(' '.join(lanl_tn93_process))
    subprocess.check_call(lanl_tn93_process, stdout=DEVNULL)
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # PHASE 6b
    # Perform concatenation
    # This is where reference annotation becomes an issue
    concatenate_data(USER_LANL_TN93OUTPUT, LANL_TN93OUTPUT_CSV,
                     OUTPUT_USERTOLANL_TN93_FN, OUTPUT_TN93_FN)

    # Return value is not used; the call is kept in case it has side effects.
    id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)

    # Create a list from TN93 csv for hivnetworkcsv filter
    create_filter_list(OUTPUT_TN93_FN, USER_FILTER_LIST)

    # PHASE 7
    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING)

    # NOTE(review): unlike PHASE 5, this runs HIVNETWORKCSV through PYTHON.
    # When HIVNETWORKCSV fell back to the bare name 'hivnetworkcsv', the
    # interpreter will treat it as a script path relative to the working
    # directory - confirm this is intended.
    lanl_hivnetworkcsv_process = [PYTHON, HIVNETWORKCSV, '-i', USER_LANL_TN93OUTPUT, '-t',
                                  threshold, '-f', SEQUENCE_ID_FORMAT, '-j', '-k', USER_FILTER_LIST]

    if filter_edges and filter_edges != 'no':
        # Edge filtering needs the sequences: LANL and user FASTA back to back.
        with open(OUTPUT_COMBINED_SEQUENCE_FILE, 'w') as combined_fasta:
            for f_path in (LANL_FASTA, OUTPUT_FASTA_FN):
                with open(f_path) as src_file:
                    shutil.copyfileobj(src_file, combined_fasta)
                    print("\n", file=combined_fasta)

        lanl_hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_COMBINED_SEQUENCE_FILE])

    if handle_contaminants in ('report', 'remove'):
        lanl_hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(lanl_hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    with open(LANL_OUTPUT_CLUSTER_JSON, 'w') as lanl_output_cluster_json_fh:
        with subprocess.Popen(lanl_hivnetworkcsv_process, stdout=lanl_output_cluster_json_fh, stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(lanl_hivnetworkcsv_process), complete_stderr)

    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.COMPLETED)

    # Annotate LANL nodes with id
    with open(LANL_OUTPUT_CLUSTER_JSON, 'r') as lanl_json_fh:
        json_info = lanl_json_fh.read()

    if json_info:
        # Only include clusters that are connected to supplied nodes
        # NOTE(review): the results are parsed from the text read *before*
        # annotate_lanl() runs; if annotate_lanl() rewrites the file, its
        # annotations are not in the returned dict - confirm.
        annotate_lanl(LANL_OUTPUT_CLUSTER_JSON, LANL_FASTA)
        lanl_trace_results = json.loads(json_info)
        results_json['lanl_trace_results'] = lanl_trace_results
    else:
        logging.debug('no lanl results!')

    return results_json
Exemple #4
0
def hivtrace(id, input, reference, ambiguities, threshold, min_overlap,
             compare_to_lanl, fraction, strip_drams_flag = False, filter_edges = "no",
             handle_contaminants = "remove", skip_alignment = False):

    """
    Run the HIV-TRACE pipeline on a FASTA file and return the results as a dict.

    PHASE 1)  Pad sequence alignment to HXB2 length with bealign
    PHASE 2)  Convert resulting bam file back to FASTA format
    PHASE 2b) Rename any duplicates in FASTA file
    PHASE 3)  Remove HXB2 and NL43 sequences (not performed by this function)
    PHASE 3b) Strip Drams if requested
    PHASE 4)  TN93 analysis on the supplied FASTA file alone
    PHASE 5)  Run hivclustercsv to return clustering information in JSON format
    PHASE 5b) Attribute annotations to results from (4)
    PHASE 6)  Run tn93 against LANL if user elects to
    PHASE 6b) Concatenate results from pre-run LANL tn93, user tn93, and (5) analyses
    PHASE 6c) Flag any potential HXB2 sequences
    PHASE 7)  Run hivclustercsv to return clustering information in json format

    Values that end up on a command line (reference, ambiguities, threshold,
    min_overlap, fraction, filter_edges, handle_contaminants) must be strings:
    the argument lists are joined with ' '.join() for logging and handed to
    subprocess.

    Parameters:
        id                  -- job identifier forwarded to update_status()
        input               -- path of the input FASTA file; '<input>_output.fasta'
                               is written next to it
        reference           -- passed to bealign ('-r')
        ambiguities         -- passed to tn93 ('-a'); 'resolve' enables `fraction`
        threshold           -- passed to tn93 and hivnetworkcsv ('-t')
        min_overlap         -- passed to tn93 ('-l')
        compare_to_lanl     -- when true, also run PHASES 6-7 against the LANL files
        fraction            -- passed to tn93 ('-g') when ambiguities == 'resolve'
        strip_drams_flag    -- false to skip DRAM stripping; otherwise forwarded
                               to sd.strip_drams() as its second argument
        filter_edges        -- 'no' (or false) to skip; otherwise passed to
                               hivnetworkcsv ('-n')
        handle_contaminants -- 'no' (or None) to skip; otherwise passed to
                               hivnetworkcsv ('-C')
        skip_alignment      -- when true, `input` is used as-is (all sequences
                               must have equal length) and PHASES 1-2 are skipped

    Returns:
        dict with key 'trace_results' and, when compare_to_lanl is true and the
        LANL run produced output, 'lanl_trace_results'.

    Raises:
        Exception                     -- incompatible arguments, or unequal
                                         sequence lengths with skip_alignment
        subprocess.CalledProcessError -- an external tool exited non-zero
        ValueError                    -- when id_to_attributes() returns one
    """

    results_json = {}

    # Treat "not specified" the same as 'no' on every code path. Previously
    # this only happened when the alignment phase ran, so skip_alignment with
    # handle_contaminants=None put None into a command line.
    if handle_contaminants is None:
        handle_contaminants = 'no'

    # Declare reference file
    resource_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'rsrc')

    # These should be defined in the user's environment
    env_dir = os.path.dirname(sys.executable)
    PYTHON = sys.executable
    BEALIGN = os.path.join(env_dir, 'bealign')
    BAM2MSA = os.path.join(env_dir, 'bam2msa')
    TN93DIST = 'tn93'
    HIVNETWORKCSV = os.path.join(env_dir, 'hivnetworkcsv')

    # This will have to be another parameter
    LANL_FASTA = os.path.join(resource_dir, 'LANL.FASTA')
    LANL_TN93OUTPUT_CSV = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv')
    DEFAULT_DELIMITER = '|'

    # Check if LANL files exists. If not, then check if zip file exists,
    # otherwise throw error
    try:
        if not os.path.isfile(LANL_FASTA):
            lanl_zip = os.path.join(resource_dir, 'LANL.FASTA.gz')
            gunzip_file(lanl_zip, LANL_FASTA)

        if not os.path.isfile(LANL_TN93OUTPUT_CSV):
            lanl_tn93output_zip = os.path.join(resource_dir, 'LANL.TN93OUTPUT.csv.gz')
            gunzip_file(lanl_tn93output_zip, LANL_TN93OUTPUT_CSV)
    except Exception: # pragma: no cover
        # `except e:` would itself fail with NameError and hide the real error.
        print("Oops, missing a resource file")
        raise

    # Check for incompatible statement before any temporary files are created
    if skip_alignment and compare_to_lanl:
        raise Exception("You have passed arguments that are incompatible! You cannot compare to the public database if you elect to submit a pre-made alignment! Please consider the issue before trying again.")

    # Python Parameters
    SCORE_MATRIX = 'HIV_BETWEEN_F'
    OUTPUT_FORMAT = 'csv'
    SEQUENCE_ID_FORMAT = 'plain'

    # Intermediate filenames
    tmp_path = tempfile.mkdtemp(prefix='hivtrace-')
    basename = os.path.basename(input)

    BAM_FN                        = os.path.join(tmp_path, basename+'_output.bam')
    OUTPUT_FASTA_FN               = input+'_output.fasta'
    OUTPUT_TN93_FN                = os.path.join(tmp_path, basename+'_user.tn93output.csv')
    JSON_TN93_FN                  = os.path.join(tmp_path, basename+'_user.tn93output.json')
    OUTPUT_COMBINED_SEQUENCE_FILE = os.path.join(tmp_path, basename+"_combined_user_lanl.fasta")
    OUTPUT_CLUSTER_JSON           = os.path.join(tmp_path, basename+'_user.trace.json')
    LANL_OUTPUT_CLUSTER_JSON      = os.path.join(tmp_path, basename+'_lanl_user.trace.json')
    OUTPUT_USERTOLANL_TN93_FN     = os.path.join(tmp_path, basename+'_usertolanl.tn93output.csv')
    USER_LANL_TN93OUTPUT          = os.path.join(tmp_path, basename+'_userlanl.tn93output.csv')
    USER_FILTER_LIST              = os.path.join(tmp_path, basename+'_user_filter.csv')
    CONTAMINANT_ID_LIST           = os.path.join(tmp_path, basename+'_contaminants.csv')

    # Sink for output we don't care about. Using the subprocess constant
    # instead of an opened os.devnull handle means there is nothing to leak on
    # early returns or exceptions.
    DEVNULL = subprocess.DEVNULL

    if skip_alignment:

        # Check for equal length in all sequences
        seqs = fasta_iter(input)
        seq_length = len(next(seqs)[1])

        if any(len(seq[1]) != seq_length for seq in seqs):
            raise Exception("Not all input sequences have the same length!")

        # copy input file to output fasta file
        shutil.copyfile(input, OUTPUT_FASTA_FN)

    else:
        # PHASE 1
        update_status(id, phases.ALIGNING, status.RUNNING)

        bealign_process = [BEALIGN, '-q', '-r', reference, '-m', SCORE_MATRIX, '-R', input, BAM_FN]

        if handle_contaminants != 'no':
            # Goes in front of the trailing "-R input BAM_FN" arguments.
            bealign_process.insert(-3, '-K')

        logging.debug(' '.join(bealign_process))
        subprocess.check_call(bealign_process, stdout=DEVNULL)
        update_status(id, phases.ALIGNING, status.COMPLETED)

        # PHASE 2
        update_status(id, phases.BAM_FASTA_CONVERSION, status.RUNNING)
        bam_process = [BAM2MSA, BAM_FN, OUTPUT_FASTA_FN]
        logging.debug(' '.join(bam_process))
        subprocess.check_call(bam_process, stdout=DEVNULL)
        update_status(id, phases.BAM_FASTA_CONVERSION, status.COMPLETED)

    if handle_contaminants != 'no':
        # The id of the first record in the alignment is written out as the
        # contaminant id list later handed to hivnetworkcsv ('-F').
        with open(OUTPUT_FASTA_FN, 'r') as msa:
            reference_name = next(SeqIO.parse(msa, 'fasta')).id
            logging.debug('Reference name set to %s' % reference_name)
            with open(CONTAMINANT_ID_LIST, 'w') as contaminants:
                print(reference_name, file=contaminants)

    # Ensure unique ids
    # Warn of duplicates by annotating with an attribute
    rename_duplicates(OUTPUT_FASTA_FN, DEFAULT_DELIMITER)
    attribute_map = ('SOURCE', 'SUBTYPE', 'COUNTRY', 'ACCESSION_NUMBER', 'YEAR_OF_SAMPLING')

    # PHASE 3b
    # Strip DRAMs
    if strip_drams_flag:
        # Write to a spool file first, then replace the alignment with it.
        OUTPUT_FASTA_FN_TMP = OUTPUT_FASTA_FN + ".spool"
        with open(str(OUTPUT_FASTA_FN_TMP), 'w') as output_file:
            for (seq_id, data) in sd.strip_drams(OUTPUT_FASTA_FN, strip_drams_flag):
                print(">%s\n%s" % (seq_id, data), file=output_file)

        shutil.move(OUTPUT_FASTA_FN_TMP, OUTPUT_FASTA_FN)

    # PHASE 4
    update_status(id, phases.COMPUTE_TN93_DISTANCE, status.RUNNING)

    # stdout and stderr of tn93 are captured in JSON_TN93_FN.
    with open(JSON_TN93_FN, 'w') as tn93_fh:
        tn93_process = [TN93DIST, '-q', '-o', OUTPUT_TN93_FN, '-t',
                        threshold, '-a', ambiguities, '-l',
                        min_overlap, '-g', fraction if ambiguities == 'resolve' else '1.0',
                        '-f', OUTPUT_FORMAT, OUTPUT_FASTA_FN]

        logging.debug(' '.join(tn93_process))
        subprocess.check_call(tn93_process, stdout=tn93_fh, stderr=tn93_fh)
        update_status(id, phases.COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # id_to_attributes() may hand back a ValueError instance instead of
    # raising it; report it and raise it here.
    id_dict = id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)
    if isinstance(id_dict, ValueError):
        update_status(id, "Error: " + id_dict.args[0])
        raise id_dict

    # PHASE 5
    update_status(id, phases.INFERRING_NETWORK, status.RUNNING)

    hivnetworkcsv_process = [HIVNETWORKCSV, '-i', OUTPUT_TN93_FN, '-t',
                             threshold, '-f', SEQUENCE_ID_FORMAT, '-j']

    if filter_edges and filter_edges != 'no':
        hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_FASTA_FN])

    if handle_contaminants != 'no':
        hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''

    # The output handle is closed even when the tool fails.
    with open(OUTPUT_CLUSTER_JSON, 'w') as output_cluster_json_fh:
        with subprocess.Popen(hivnetworkcsv_process, stdout=output_cluster_json_fh, stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.INFERRING_NETWORK, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        # Report the real exit status of the process.
        raise subprocess.CalledProcessError(p.returncode, ' '.join(hivnetworkcsv_process), complete_stderr)

    update_status(id, phases.INFERRING_NETWORK, status.COMPLETED, complete_stderr)

    # Read output_cluster_json
    with open(OUTPUT_CLUSTER_JSON, 'r') as cluster_json_fh:
        results_json["trace_results"] = json.load(cluster_json_fh)

    if not compare_to_lanl:
        return results_json

    # PHASE 6
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.RUNNING)

    # '-g fraction' is only passed when ambiguities are being resolved.
    lanl_tn93_process = [TN93DIST, '-q', '-o', OUTPUT_USERTOLANL_TN93_FN, '-t',
                         threshold, '-a', ambiguities,
                         '-f', OUTPUT_FORMAT]
    if ambiguities == 'resolve':
        lanl_tn93_process.extend(['-g', fraction])
    lanl_tn93_process.extend(['-l', min_overlap, '-s', LANL_FASTA, OUTPUT_FASTA_FN])

    logging.debug(' '.join(lanl_tn93_process))
    subprocess.check_call(lanl_tn93_process, stdout=DEVNULL)
    update_status(id, phases.PUBLIC_COMPUTE_TN93_DISTANCE, status.COMPLETED)

    # PHASE 6b
    # Perform concatenation
    # This is where reference annotation becomes an issue
    concatenate_data(USER_LANL_TN93OUTPUT, LANL_TN93OUTPUT_CSV,
                     OUTPUT_USERTOLANL_TN93_FN, OUTPUT_TN93_FN)

    # Return value is not used; the call is kept in case it has side effects.
    id_to_attributes(OUTPUT_TN93_FN, attribute_map, DEFAULT_DELIMITER)

    # Create a list from TN93 csv for hivnetworkcsv filter
    create_filter_list(OUTPUT_TN93_FN, USER_FILTER_LIST)

    # PHASE 7
    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING)

    # Unlike PHASE 5, this runs HIVNETWORKCSV through PYTHON.
    lanl_hivnetworkcsv_process = [PYTHON, HIVNETWORKCSV, '-i', USER_LANL_TN93OUTPUT, '-t',
                                  threshold, '-f', SEQUENCE_ID_FORMAT, '-j', '-k', USER_FILTER_LIST]

    if filter_edges and filter_edges != 'no':
        # Edge filtering needs the sequences: LANL and user FASTA back to back.
        with open(OUTPUT_COMBINED_SEQUENCE_FILE, 'w') as combined_fasta:
            for f_path in (LANL_FASTA, OUTPUT_FASTA_FN):
                with open(f_path) as src_file:
                    shutil.copyfileobj(src_file, combined_fasta)
                    print("\n", file=combined_fasta)

        lanl_hivnetworkcsv_process.extend(['-n', filter_edges, '-s', OUTPUT_COMBINED_SEQUENCE_FILE])

    if handle_contaminants != 'no':
        lanl_hivnetworkcsv_process.extend(['-C', handle_contaminants, '-F', CONTAMINANT_ID_LIST])

    logging.debug(' '.join(lanl_hivnetworkcsv_process))

    # hivclustercsv uses stderr for status updates
    complete_stderr = ''
    with open(LANL_OUTPUT_CLUSTER_JSON, 'w') as lanl_output_cluster_json_fh:
        with subprocess.Popen(lanl_hivnetworkcsv_process, stdout=lanl_output_cluster_json_fh, stderr=PIPE, bufsize=1, universal_newlines=True) as p:
            for line in p.stderr:
                complete_stderr += line
                update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.RUNNING, complete_stderr)
            p.wait()

    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, ' '.join(lanl_hivnetworkcsv_process), complete_stderr)

    update_status(id, phases.PUBLIC_INFERRING_CONNECTIONS, status.COMPLETED)

    # Annotate LANL nodes with id
    with open(LANL_OUTPUT_CLUSTER_JSON, 'r') as lanl_json_fh:
        json_info = lanl_json_fh.read()

    if json_info:
        # Only include clusters that are connected to supplied nodes
        # NOTE(review): the results are parsed from the text read *before*
        # annotate_lanl() runs; if annotate_lanl() rewrites the file, its
        # annotations are not in the returned dict - confirm.
        annotate_lanl(LANL_OUTPUT_CLUSTER_JSON, LANL_FASTA)
        lanl_trace_results = json.loads(json_info)
        results_json['lanl_trace_results'] = lanl_trace_results
    else:
        logging.debug('no lanl results!')

    return results_json