Esempio n. 1
0
 def setUp(self):
     """Write the reference-sequence and starting-tree fixtures to temp files.

     Sets self.seqs, self.outfile, self.outfasta and self.outtree, and records
     every file written in self._paths_to_clean_up.
     """
     # lists of files / directories to clean up
     self._paths_to_clean_up = []
     self._dirs_to_clean_up = []

     # load query seqs
     self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))

     # generate temp filename; both fixture files share its stem
     tmp_dir = '/tmp'
     self.outfile = get_tmp_filename(tmp_dir)
     stem = splitext(self.outfile)[0]

     # reference sequence file and starting tree file
     self.outfasta = stem + '.fasta'
     self.outtree = stem + '.tree'
     fixtures = ((self.outfasta, REF_SEQS), (self.outtree, REF_TREE))
     for path, contents in fixtures:
         # the context manager closes the handle even if the write fails
         with open(path, 'w') as handle:
             # register as soon as the file exists so that a failed write
             # still leaves the path listed for cleanup
             self._paths_to_clean_up.append(path)
             handle.write(contents)
Esempio n. 2
0
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences and return the resulting tree.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    neighbor_join: currently ignored by this function.
    params: optional dict of Muscle parameters. It is copied, so neither the
        caller's dict nor a shared default is modified. "-maxhours",
        "-clusteronly" and "-tree1" are always overwritten.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: passed through
        to muscle_seqs.
    max_chars: when the summed length of the items in seqs exceeds this,
        "-maxiters", "-diags1" and "-sv" are set to request a faster run.
    max_hours: value used for "-maxhours".
    constructor: node constructor handed to DndParser.
    clean_up: when true, muscle_res.cleanUp() is called before returning,
        even if parsing the tree fails.

    Returns the tree parsed from the "Tree1Out" result.
    Raises ValueError if len(seqs) < 2.
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    # Private copy: the keys set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        print("lots of chars, using fast align %s" % num_chars)

    params["-maxhours"] = max_hours
    params["-clusteronly"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)

    try:
        tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)
    finally:
        # run the cleanup on the failure path too
        if clean_up:
            muscle_res.cleanUp()
    return tree
Esempio n. 3
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree built by RAxML from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object; only consulted when params
    does not already contain "-m".

    best_tree: best_tree support is currently not implemented; a true value
    raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller. It is
    copied, so the caller's dict is never modified. "-w", "-n", "-k", "-p"
    and "-x" are always overwritten.

    The result is the tree parsed from the 'Bootstrap' output (PhyloNode
    constructor), with tip names mapped back through the Phylip name map.

    Raises ValueError when "-m" is not given and moltype is not DNA, RNA or
    PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Private copy: the keys set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            # NOTE(review): the value actually used here is GTRGAMMA.
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' looks like an unfilled placeholder
            # for a substitution matrix name - confirm the intended model.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    params["-p"] = randint(1,100000)
    params["-x"] = randint(1,100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # restore the original sequence names on the tips
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # remove the output files even when parsing / renaming fails
        raxml_result.cleanUp()

    return tree
Esempio n. 4
0
def aln_tree_seqs(seqs,
                 input_handler=None,
                 tree_type='neighborjoining',
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_hours=5.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, returns tree2 which is the tree made during the
    second muscle iteration (it should be more accurate than the cluster from
    the first iteration which is made fast based on k-mer words)

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    input_handler: passed through to muscle_seqs.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA;
        a false value leaves "-cluster2" unset.
    params: optional dict of Muscle parameters. It is copied, so the caller's
        dict is never modified. "-maxhours", "-tree2" and "-out" are always
        overwritten.
    max_hours: value used for "-maxhours".
    constructor: node constructor handed to DndParser.
    clean_up: When true, will clean up output files (also on failure).

    Returns (tree, aln) where aln is the list of lines of the "MuscleOut"
    result.
    """
    # Private copy: the keys set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}

    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 input_handler=input_handler,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)
    try:
        tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
        # read the alignment before cleanup runs
        aln = list(muscle_res["MuscleOut"])
    finally:
        if clean_up:
            muscle_res.cleanUp()
    return tree, aln
Esempio n. 5
0
def raxml_alignment(align_obj,
                 raxml_model="GTRCAT",
                 params=None,
                 SuppressStderr=True,
                 SuppressStdout=True):
    """Run raxml on alignment object

    align_obj: Alignment object (must provide toPhylip()).
    raxml_model: value used for "-m".
    params: optional dict of RAxML parameters. It is copied, so the caller's
        dict is never modified. "-w", "-n", "-m" and "-p" are always
        overwritten.
    SuppressStderr, SuppressStdout: passed to the Raxml app controller.

    returns: tuple (phylonode,
                    parsimonyphylonode,
                    log likelihood,
                    total exec time)

    The log likelihood is taken from the last non-blank line of the "Log"
    result; the total exec time is the sum of the first column over all
    non-blank lines. Both are 0.0 when the log has no such lines.
    """
    # Private copy: the keys set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-m"] = raxml_model
    params["-p"] = randint(1,100000)
    ih = '_input_as_multiline_string'
    # the name map returned alongside the Phylip text is not used here
    seqs = align_obj.toPhylip()[0]

    # set up command
    raxml_app = Raxml(
                   params=params,
                   InputHandler=ih,
                   WorkingDir=None,
                   SuppressStderr=SuppressStderr,
                   SuppressStdout=SuppressStdout)

    # run raxml
    ra = raxml_app(seqs)

    try:
        # generate tree
        tree_node = DndParser(ra["Result"])

        # generate parsimony tree
        parsimony_tree_node = DndParser(ra["ParsimonyTree"])

        # extract log likelihood from log file
        total_exec_time = log_likelihood = 0.0
        for line in ra["Log"]:
            fields = line.split()
            if not fields:
                # tolerate blank lines instead of failing on the unpack
                continue
            exec_time, log_likelihood = map(float, fields)
            total_exec_time += exec_time
    finally:
        # remove output files, also when parsing fails
        ra.cleanUp()

    return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
Esempio n. 6
0
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree using params - test handles tree-insertion"""

        # generate temp filename for output
        outfname = splitext(get_tmp_filename('/tmp/'))[0]

        # create starting tree
        outtreefname = outfname + '.tre'
        with open(outtreefname, 'w') as outtree:
            outtree.write(REF_TREE)

        try:
            # set params for tree-insertion
            params = {}
            params["-w"] = "/tmp/"
            params["-n"] = get_tmp_filename().split("/")[-1]
            params["-f"] = 'v'
            params["-t"] = outtreefname
            params["-m"] = 'GTRGAMMA'

            aln_ref_query = get_align_for_phylip(
                StringIO(PHYLIP_FILE_DNA_REF_QUERY))
            aln = Alignment(aln_ref_query)
            seqs, align_map = aln.toPhylip()

            tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                              write_log=False)

            # strip the 'QUERY___' prefix and '___<digits>' suffix from tip
            # names, then map the remaining name back to the original label
            for node in tree.tips():
                removed_query_str = re.sub('QUERY___', '', str(node.Name))
                new_node_name = re.sub(r'___\d+', '', str(removed_query_str))
                if new_node_name in align_map:
                    node.Name = align_map[new_node_name]

            self.assertTrue(isinstance(tree, PhyloNode))
            self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
            self.assertEqual(len(tree.tips()), 7)
            self.assertRaises(NotImplementedError, build_tree_from_alignment,
                              self.align1, RNA, True)
        finally:
            # remove the starting tree even when an assertion fails
            remove(outtreefname)
Esempio n. 7
0
def insert_sequences_into_tree(seqs, moltype, params=None,
                                           write_log=True):
    """Insert sequences into Tree.

    seqs: input handed to the RAxML app controller as a multiline string.

    moltype: cogent.core.moltype.MolType object (not used by this function).

    params: dict of parameters to pass in to the RAxML app controller. Must
    contain "-w" when write_log is true, since the log file is written into
    that directory.

    write_log: when true, the app's StdOut is saved to a 'log_raxml_*' file.

    The result will be a tree.

    Raises KeyError (before RAxML is run) if write_log is true and params has
    no "-w" entry.
    """
    if params is None:
        params = {}

    # look the log directory up first so a missing "-w" fails before the run
    log_dir = params["-w"] if write_log else None

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False,
                      HALT_EXEC=False)

    raxml_result = raxml_app(seqs)

    try:
        # write a log file
        if write_log:
            log_fp = join(log_dir, 'log_raxml_' + split(get_tmp_filename())[-1])
            with open(log_fp, 'w') as log_file:
                log_file.write(raxml_result['StdOut'].read())

        # An alternative route through the jplace output (converting it with
        # guppy via build_tree_from_json_using_params) was disabled in the
        # original code with the remark that the jplace file is corrupt.

        # get tree from 'Result Names'
        new_tree = raxml_result['Result'].readlines()
        # NOTE(review): str() is applied to the *list* of lines, so the parser
        # receives the list's repr rather than the joined text - kept as-is
        # because existing expectations may depend on it; confirm.
        filtered_tree = re.sub(r'\[I\d+\]', '', str(new_tree))
        tree = DndParser(filtered_tree, constructor=PhyloNode)
    finally:
        # remove the output files even when logging / parsing fails
        raxml_result.cleanUp()

    return tree
Esempio n. 8
0
def build_tree_from_distance_matrix(matrix, best_tree=False, params=None, working_dir="/tmp"):
    """Returns a tree from a distance matrix.

    matrix: a square Dict2D object (cogent.util.dict2d)

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
    It is copied, so the caller's dict is never modified; "--out" is always
    overwritten with a temp filename inside working_dir.

    working_dir: directory used for the temp output file and as the app
    controller's WorkingDir.

    The result is the tree parsed from the "Tree" output (PhyloNode
    constructor) with the original names restored on the tips.
    """
    # Private copy: the key set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}
    params["--out"] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(
        InputHandler="_input_as_multiline_string",
        params=params,
        WorkingDir=working_dir,
        SuppressStdout=True,
        SuppressStderr=True,
    )
    # Turn off input as alignment
    app.Parameters["-a"].off()
    # Input is a distance matrix
    app.Parameters["-d"].on()

    if best_tree:
        app.Parameters["-N"].on()

    # Turn the dict2d object into the expected input format
    matrix_input, int_keys = _matrix_input_from_dict2d(matrix)

    # Collect result
    result = app(matrix_input)

    try:
        # Build tree
        tree = DndParser(result["Tree"].read(), constructor=PhyloNode)

        # reassign to original names
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Clean up, also when parsing / renaming fails
        result.cleanUp()

    return tree
Esempio n. 9
0
def insert_sequences_into_tree(aln, moltype, params=None,
                                           write_log=True):
    """Returns a placement tree built by pplacer (and guppy) from aln.

    aln: alignment text; it is wrapped in StringIO and passed through
    get_align_for_phylip before being converted to FASTA.

    moltype: cogent.core.moltype.MolType object (not used by this function).

    params: dict of parameters to pass in to the Pplacer app controller.
    Must contain "--out-dir": it is used for the log file and as the guppy
    output directory.

    write_log: when true, the app's StdOut is saved to a 'log_pplacer_*' file.

    The result is whatever build_tree_from_json_using_params returns for the
    pplacer 'json' output.

    Raises KeyError (before pplacer is run) if params has no "--out-dir".
    """
    if params is None:
        params = {}

    # needed unconditionally further down; fail before running pplacer
    out_dir = params['--out-dir']

    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    ih = '_input_as_multiline_string'

    pplacer_app = Pplacer(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)

    try:
        # write a log file
        if write_log:
            log_fp = join(out_dir,
                          'log_pplacer_' + split(get_tmp_filename())[-1])
            with open(log_fp, 'w') as log_file:
                log_file.write(pplacer_result['StdOut'].read())

        # use guppy to convert json file into a placement tree
        guppy_params = {'tog': None}

        new_tree = build_tree_from_json_using_params(
            pplacer_result['json'].name,
            output_dir=out_dir,
            params=guppy_params)
    finally:
        # remove the output files even when logging / conversion fails
        pplacer_result.cleanUp()

    return new_tree
Esempio n. 10
0
def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
    """Returns a tree built by Muscle from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object; its label is passed as
    '-seqtype'.

    best_tree: unsupported (ignored).

    params: dict of parameters to pass in to the Muscle app controller.

    The result is the tree parsed from the 'Tree1Out' output (PhyloNode
    constructor) with the original sequence names restored on the tips.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params,
                 WorkingDir='/tmp')

    app.Parameters['-clusteronly'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Collect result
    result = app(int_map.toFasta())

    try:
        # Build tree
        tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)

        # restore the full IDs on the tips
        for tip in tree.tips():
            tip.Name = int_keys[tip.Name]
    finally:
        # Clean up, also when parsing / renaming fails
        result.cleanUp()

    return tree
Esempio n. 11
0
def align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller. It
    is copied, so the caller's dict is never modified; '-out' is always
    overwritten with a temp filename.

    Result will be an Alignment object keyed by the original sequence IDs.
    """
    # Private copy: '-out' must not be written into the caller's dict.
    params = dict(params) if params else {}

    # create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # get temporary filename
    params['-out'] = get_tmp_filename()

    # Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',
                 params=params)
    # Get results using int_map as input to app
    res = app(int_map.toFasta())

    try:
        # Get alignment as dict out of results
        alignment = dict(parse_fasta(res['MuscleOut']))
    finally:
        # Clean up, also when reading the output fails
        res.cleanUp()

    # Make new dict mapping original IDs
    remapped = dict((int_keys[k], v) for k, v in alignment.items())

    # Create an Alignment object from alignment dict
    return Alignment(remapped, MolType=moltype)
Esempio n. 12
0
 def setUp(self):
     '''setup the files for testing pplacer

     Sets self.stats_fname, self.refseq_fname, self.query_fname and
     self.tree_fname, writes the matching fixture to each, and records every
     file written in self._paths_to_clean_up.
     '''
     # lists of files / directories to clean up
     self._paths_to_clean_up = []
     self._dirs_to_clean_up = []

     # get a tmp filename to use; all fixture files share its stem
     basename = splitext(get_tmp_filename())[0]

     self.stats_fname = basename + '.stats'
     self.refseq_fname = basename + '_refseqs.fasta'
     self.query_fname = basename + '_queryseqs.fasta'
     self.tree_fname = basename + '.tre'

     fixtures = (
         (self.stats_fname, RAXML_STATS),   # RAxML stats file
         (self.refseq_fname, REF_SEQS),     # reference sequence file
         (self.query_fname, QUERY_SEQS),    # query sequence file
         (self.tree_fname, REF_TREE),       # starting tree file
     )
     for path, contents in fixtures:
         # the context manager closes the handle even if the write fails
         with open(path, 'w') as handle:
             # register as soon as the file exists so that a failed write
             # still leaves the path listed for cleanup
             self._paths_to_clean_up.append(path)
             handle.write(contents)
Esempio n. 13
0
def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None, working_dir="/tmp"):
    """Returns a tree built by Clearcut from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        -  Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.
    It is copied, so the caller's dict is never modified; "--out" is always
    overwritten with a temp filename inside working_dir.

    working_dir: directory used for the temp output file and as the app
    controller's WorkingDir.

    The result is the tree parsed from the "Tree" output (PhyloNode
    constructor) with the original sequence names restored on the tips.
    """
    # Private copy: the key set below must not leak into the caller's dict
    # or into later calls through a shared default argument.
    params = dict(params) if params else {}
    params["--out"] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(
        InputHandler="_input_as_multiline_string",
        params=params,
        WorkingDir=working_dir,
        SuppressStdout=True,
        SuppressStderr=True,
    )
    # Input is an alignment
    app.Parameters["-a"].on()
    # Turn off input as distance matrix
    app.Parameters["-d"].off()

    # If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters["-N"].on()

    # Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping. Clearcut clips identifiers. We will need to remap them.
    # Clearcut only accepts aligned sequences.  Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    # get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    # create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    try:
        # Build tree and restore the original identifiers on the tips
        tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Clean up, also when parsing / renaming fails
        result.cleanUp()

    return tree
Esempio n. 14
0
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    params: dict of parameters to pass in to the Muscle app controller. It
    is copied, so the caller's dict is never modified; '-out' and '-profile'
    are always overwritten.

    Both inputs are written to temp files under abbreviated IDs (prefixed
    'aln1_' / 'aln2_'); the original IDs are restored in the returned
    Alignment. The temp files and the Muscle results are removed even when
    an error occurs.
    """
    # Private copy: the keys set below must not leak into the caller's dict.
    params = dict(params) if params else {}

    # create SequenceCollection object from aln1
    aln1_collection = SequenceCollection(aln1)
    # Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    # Create SequenceCollection from int_map.
    aln1_int_map = SequenceCollection(aln1_int_map)

    # same for aln2
    aln2_collection = SequenceCollection(aln2)
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    aln2_int_map = SequenceCollection(aln2_int_map)

    # set output and profile options
    params.update({'-out': get_tmp_filename(), '-profile': True})

    tmp_paths = []
    try:
        # save aln1 and then aln2 to tmp files
        for int_map in (aln1_int_map, aln2_int_map):
            tmp_path = get_tmp_filename()
            tmp_paths.append(tmp_path)
            with open(tmp_path, 'w') as tmp_out:
                tmp_out.write(int_map.toFasta())

        # Create Muscle app and get results
        app = Muscle(InputHandler='_input_as_multifile', params=params)
        res = app(tuple(tmp_paths))

        try:
            # Get alignment as dict out of results
            alignment = dict(parse_fasta(res['MuscleOut']))
        finally:
            res.cleanUp()
    finally:
        for tmp_path in tmp_paths:
            try:
                remove(tmp_path)
            except OSError:
                # best effort: the path is recorded before the file is
                # opened, so it may never have been created
                pass

    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    # Create an Alignment object from alignment dict
    return Alignment(new_alignment)
Esempio n. 15
0
def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp, read_1_seqs_fp, read_2_seqs_fp, single_ok=False, no_single_ok_generic=False,
                    header_id_regex=None, read_id_regex = r"\S+\s+(\S+)", amplicon_id_regex = r"(\S+)\s+(\S+?)\/",
                    output_fp=None, log_path=None, HALT_EXEC=False, base_tmp_dir = '/tmp'):
    """Assign taxonomy to each sequence in data with the RTAX classifier

        dataPath: path to a fasta file

        reference_sequences_fp, id_to_taxonomy_fp: passed to RTAX as -r / -t.

        read_1_seqs_fp: fasta file passed to RTAX as -a; its headers are also
         scanned with amplicon_id_regex.

        read_2_seqs_fp: passed to RTAX as -b unless it is None.

        single_ok, no_single_ok_generic: turn on the RTAX -f / -g flags.

        header_id_regex: passed to RTAX as -i.

        read_id_regex: matched against each header in dataPath; group 1 is
         the read-1 id.

        amplicon_id_regex: matched against each header in read_1_seqs_fp;
         group 1 is the read-1 id, group 2 the amplicon id.

        output_fp: path to write output; if not provided, result will be
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}

        log_path: if given, the app's StdErr is appended to this file.

        base_tmp_dir: parent directory for the scratch directory, which is
         removed again before returning.

        Headers that match neither regex are reported on stderr and skipped.
        The confidence value is always 1.0.

        Raises ApplicationNotFoundError if usearch cannot be found, and
        OSError if output_fp cannot be opened for writing.
    """

    usearch_command = "usearch"
    if not (exists(usearch_command) or app_path(usearch_command)):
        raise ApplicationNotFoundError(
            "Cannot find %s. Is it installed? Is it in your path?"
            % usearch_command)

    my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir,prefix='rtax_',suffix='',result_constructor=str)
    os.makedirs(my_tmp_dir)

    try:
        # RTAX classifier doesn't necessarily preserve identifiers
        # it reports back only the id extracted as $1 using header_id_regex
        # since rtax takes the original unclustered sequence files as input,
        # the usual case is that the regex extracts the amplicon ID from the second field

        # Lookup table: read_1 id -> original header in dataPath
        read_1_id_to_orig_id = {}
        readIdExtractor = re.compile(read_id_regex)  # OTU clustering produces ">clusterID read_1_id"
        with open(dataPath, 'r') as data:
            for seq_id, seq in parse_fasta(data):
                extract = readIdExtractor.match(seq_id)
                if extract is None:
                    stderr.write("Matched no ID with read_id_regex " + read_id_regex +" in '" + seq_id + "' from file " + dataPath + "\n")
                else:
                    read_1_id_to_orig_id[extract.group(1)] = seq_id

        # Establish mapping of amplicon IDs to read_1 IDs and simultaneously
        # write the list of amplicon IDs to pass to RTAX (only those whose
        # read_1 id was found in the input mapping above)
        id_list_path = my_tmp_dir + "/ampliconIdsToClassify"
        amplicon_to_read_1_id = {}
        ampliconIdExtractor = re.compile(amplicon_id_regex)  # split_libraries produces >read_1_id ampliconID/1 ...
        with open(id_list_path, "w") as id_list_fp:
            # the context manager closes read_1_data; the original code
            # closed `data` a second time here and left this handle open
            with open(read_1_seqs_fp, 'r') as read_1_data:
                for seq_id, seq in parse_fasta(read_1_data):
                    extract = ampliconIdExtractor.match(seq_id)
                    if extract is None:
                        stderr.write("Matched no ID with amplicon_id_regex " + amplicon_id_regex + " in '" + seq_id + "' from file " + read_1_seqs_fp + "\n")
                        continue
                    read_1_id = extract.group(1)
                    amplicon_id = extract.group(2)
                    # recorded for every matching header, as before
                    amplicon_to_read_1_id[amplicon_id] = read_1_id
                    # only ids that are valid in dataPath get classified
                    if read_1_id in read_1_id_to_orig_id:
                        id_list_fp.write('%s\n' % (amplicon_id))

        app = Rtax(HALT_EXEC=HALT_EXEC)

        temp_output_file = tempfile.NamedTemporaryFile(
            prefix='RtaxAssignments_', suffix='.txt')
        app.Parameters['-o'].on(temp_output_file.name)
        app.Parameters['-r'].on(reference_sequences_fp)
        app.Parameters['-t'].on(id_to_taxonomy_fp)
        app.Parameters['-l'].on(id_list_path)  # these are amplicon IDs
        app.Parameters['-a'].on(read_1_seqs_fp)
        if read_2_seqs_fp is not None:
            app.Parameters['-b'].on(read_2_seqs_fp)
        app.Parameters['-i'].on(header_id_regex)
        app.Parameters['-m'].on(my_tmp_dir)
        if single_ok:
            app.Parameters['-f'].on()
        if no_single_ok_generic:
            app.Parameters['-g'].on()

        app_result = app()

        if log_path:
            with open(log_path, 'a') as log_file:
                log_file.write(''.join(app_result['StdErr'].readlines()) + '\n')

        assignments = {}

        # RTAX does not provide a measure of confidence.  We could pass one in,
        # based on the choice of primers, or even look it up on the fly in the tables
        # from the "optimal primers" paper; but it would be the same for every
        # query sequence anyway.
        # we could also return bestpcid, but that's not the same thing as confidence.
        confidence = 1.0

        # restore original sequence IDs with spaces
        for line in app_result['Assignments']:
            toks = line.strip().split('\t')
            rtax_id = toks[0]
            # toks[1], when present, is the best percent id and is ignored
            lineage = toks[2:]

            read_1_id = amplicon_to_read_1_id[rtax_id]
            orig_id = read_1_id_to_orig_id[read_1_id]
            if lineage:
                assignments[orig_id] = (';'.join(lineage), confidence)
            else:
                assignments[orig_id] = ('Unclassified', 1.0)

        if output_fp:
            try:
                output_file = open(output_fp, 'w')
            except (IOError, OSError):
                # open() raises IOError on Python 2, which `except OSError`
                # alone does not catch there
                raise OSError("Can't open output file for writing: %s" % output_fp)
            with output_file:
                for seq_id, assignment in assignments.items():
                    lineage, confidence = assignment
                    output_file.write(
                        '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
            return None
        else:
            return assignments
    finally:
        # best-effort removal of the scratch directory
        try:
            rmtree(my_tmp_dir)
        except OSError:
            pass