Example #1
0
def map_otu_map_files(otu_files, failures_file=None):
    # passing delim=None splits on any whitespace, so can handle mixed tabs
    # and spaces
    result = fields_to_dict(otu_files[0], delim=None)
    for otu_file in otu_files[1:]:
        current_otu_map = fields_to_dict(otu_file, delim=None)
        result = expand_otu_map_seq_ids(current_otu_map, result)
    if failures_file:
        result = expand_failures(failures_file, result)
    return result
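All of these snippets revolve around the QIIME parser fields_to_dict, whose contract is pinned down by the unit test in Example #20: the first delimited field of each line becomes a key, and the remaining fields become the value list. Below is a minimal sketch consistent with that test and with the delim=None comment above; the name fields_to_dict_sketch and the exact defaults are assumptions, not the upstream implementation.

def fields_to_dict_sketch(lines, delim='\t', strip_f=str.strip):
    """Map the first field of each line to a list of its remaining fields.

    Passing delim=None splits on any run of whitespace, which is how
    map_otu_map_files above tolerates mixed tabs and spaces.
    """
    result = {}
    for line in lines:
        fields = strip_f(line).split(delim)
        if not fields or not fields[0]:
            continue  # skip blank lines
        result[fields[0]] = fields[1:]
    return result

# e.g. fields_to_dict_sketch(['0\tseq1 seq2\n'], delim=None)
#      == {'0': ['seq1', 'seq2']}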
Example #2
0
def map_otu_map_files(otu_files, failures_file=None):
    # passing delim=None splits on any whitespace, so can handle mixed tabs
    # and spaces
    result = fields_to_dict(otu_files[0], delim=None)
    for otu_file in otu_files[1:]:
        current_otu_map = fields_to_dict(otu_file, delim=None)
        result = expand_otu_map_seq_ids(current_otu_map, result)
    if failures_file:
        result = expand_failures(failures_file, result)
    return result
Example #3
0
def get_seqs_to_keep_lookup_from_otu_map(seqs_to_keep_f):
    """Generate a lookup dictionary from an OTU map"""
    otu_map = fields_to_dict(seqs_to_keep_f)
    seqs_to_keep = []
    for seq_ids in otu_map.values():
        seqs_to_keep += seq_ids
    return {}.fromkeys(seqs_to_keep)
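The {}.fromkeys(...) idiom on the last line builds a dictionary mapping every sequence ID to None, which gives constant-time membership tests later (a set would serve the same purpose). A tiny illustration with made-up IDs:

lookup = {}.fromkeys(['seq1', 'seq2'])  # {'seq1': None, 'seq2': None}
print 'seq1' in lookup  # True
print 'seq9' in lookup  # False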
Example #4
0
def get_seqs_to_keep_lookup_from_otu_map(seqs_to_keep_f):
    """Generate a lookup dictionary from an OTU map"""
    otu_map = fields_to_dict(seqs_to_keep_f)
    seqs_to_keep = []
    for seq_ids in otu_map.values():
        seqs_to_keep += seq_ids
    return {}.fromkeys(seqs_to_keep)
Example #5
0
    def test_parallel_rdp_taxonomy_assigner(self):
        """ parallel_rdp_taxonomy_assigner functions as expected """

        params = {
            'id_to_taxonomy_fp': self.id_to_taxonomy_file.name,
            'rdp_max_memory': 1500,
            'rdp_classifier_fp': getenv('RDP_JAR_PATH'),
            'confidence': 0.80,
            'reference_seqs_fp': self.reference_seqs_file.name
        }

        app = ParallelRdpTaxonomyAssigner()
        r = app(self.tmp_seq_filepath,
                self.test_out,
                params,
                job_prefix='RDPTEST',
                poll_directly=True,
                suppress_submit_jobs=False)
        results = fields_to_dict(
            open(glob(join(self.test_out, '*_tax_assignments.txt'))[0], 'U'))
        # some basic sanity checks: we should get the same number of sequences
        # as our input with the same seq IDs. We should have a taxonomy string
        # and a confidence value for each seq as well.
        self.assertEqual(len(results), 2)
        self.assertEqual(len(results['X67228']), 2)
        self.assertEqual(len(results['EF503697']), 2)
Example #6
0
def main():

    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
       
    sample_id_map_fp = opts.sample_id_map_fp
    if sample_id_map_fp:
        sample_id_map = dict([(k,v[0]) \
         for k,v in fields_to_dict(open(sample_id_map_fp, "U")).items()])
    else:
        sample_id_map = None
    
    input_dm_fps = opts.input_dms.split(',')
    output_f = open(opts.output_fp,'w')
    output_f.write(comment)
    output_f.write('DM1\tDM2\tNumber of entries\tMantel p-value\n')
    num_iterations = opts.num_iterations
    for i,fp1 in enumerate(input_dm_fps):
        for fp2 in input_dm_fps[i+1:]:
            (dm1_labels, dm1), (dm2_labels, dm2) =\
             make_compatible_distance_matrices(parse_distmat(open(fp1,'U')),
                                               parse_distmat(open(fp2,'U')),
                                               lookup=sample_id_map)
            if len(dm1_labels) < 2:
                output_f.write('%s\t%s\t%d\tToo few samples\n' % (fp1,fp2,len(dm1_labels)))
                continue
            p = mantel(dm1,dm2,n=num_iterations)
            p_str = format_p_value_for_num_iters(p,num_iterations)
            output_f.write('%s\t%s\t%d\t%s\n' % (fp1,fp2,len(dm1_labels),p_str))
    output_f.close()
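The sample_id_map built at the top of this main() expects one old-ID/new-ID pair per line: fields_to_dict maps each old ID to a list of the remaining fields, and v[0] keeps only the first. A small worked example, assuming fields_to_dict is importable and using hypothetical sample IDs:

lines = ['SampleA\tS1\n', 'SampleB\tS2\n']  # hypothetical map file contents
sample_id_map = dict([(k, v[0])
                      for k, v in fields_to_dict(lines).items()])
# sample_id_map == {'SampleA': 'S1', 'SampleB': 'S2'}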
Example #7
0
def rewrite_otu_table_with_taxonomy(taxon_lines,
                                    otu_lines,
                                    id_map_lines=None,
                                    outfile=stdout):
    """Rewrites OTU table including taxonomy."""
    taxonomy = fields_to_dict(taxon_lines)
    #sometimes have extra fields after OTU id
    new_taxonomy = {}
    for k, v in taxonomy.items():
        new_taxonomy[k.split()[0]] = v
    taxonomy = new_taxonomy
    taxonomy = fix_taxonomy_delimiters(taxonomy)

    if id_map_lines:
        id_map = dict([map(strip, line.split('\t')) for line in id_map_lines])
        new_taxonomy = dict([(id_map[k], v) for k, v in taxonomy.items()
                             if k in id_map])
        assert new_taxonomy != taxonomy
        taxonomy = new_taxonomy

    for line in otu_lines:
        if not line.endswith('\n'):
            line += '\n'
        if line.startswith('#OTU ID'):
            outfile.write(line[:-1] + '\tConsensus Lineage\n')
        elif line.startswith('#'):
            outfile.write(line)
        else:
            id_, rest = line.split('\t', 1)
            t = taxonomy.get(id_, 'None')
            outfile.write(line[:-1] + '\t' + t + '\n')
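Example #7 appends a Consensus Lineage column: the '#OTU ID' header line gains the new column name, other comment lines pass through, and each data line gains that OTU's taxonomy string. The self-contained sketch below replicates just the line-rewriting loop; the OTU IDs and lineage string are invented.

from StringIO import StringIO

taxonomy = {'0': 'Bacteria;Firmicutes'}  # hypothetical lineage lookup
out = StringIO()
for line in ['#OTU ID\tS1\tS2\n', '0\t5\t3\n']:
    if line.startswith('#OTU ID'):
        out.write(line[:-1] + '\tConsensus Lineage\n')
    elif line.startswith('#'):
        out.write(line)
    else:
        id_, rest = line.split('\t', 1)
        out.write(line[:-1] + '\t' + taxonomy.get(id_, 'None') + '\n')
# out.getvalue() ==
# '#OTU ID\tS1\tS2\tConsensus Lineage\n0\t5\t3\tBacteria;Firmicutes\n'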
Example #8
0
    def test_parallel_blast_taxonomy_assigner(self):
        """ parallel_blast_taxonomy_assigner functions as expected """
        params = {
            'id_to_taxonomy_fp': self.id_to_taxonomy_file.name,
            'blastmat_dir': None,
            'e_value': 0.001,
            'blast_db': None,
            'reference_seqs_fp': self.reference_seqs_file.name
        }

        app = ParallelBlastTaxonomyAssigner()
        r = app(self.tmp_seq_filepath,
                self.test_out,
                params,
                job_prefix='BTATEST',
                poll_directly=True,
                suppress_submit_jobs=False)
        results = fields_to_dict(
            open(glob(join(self.test_out, '*_tax_assignments.txt'))[0], 'U'))
        # some basic sanity checks: we should get the same number of sequences
        # as our input with the same seq IDs. We should have a taxonomy string
        # and a confidence value for each seq as well.
        self.assertEqual(len(results), 6)
        self.assertEqual(len(results['s1']), 3)
        self.assertEqual(len(results['s6']), 3)
Example #9
0
def rewrite_otu_table_with_taxonomy(taxon_lines, otu_lines, id_map_lines=None,
    outfile=stdout):
    """Rewrites OTU table including taxonomy."""
    taxonomy = fields_to_dict(taxon_lines)
    #sometimes have extra fields after OTU id
    new_taxonomy = {}
    for k, v in taxonomy.items():
        new_taxonomy[k.split()[0]] = v
    taxonomy = new_taxonomy
    taxonomy = fix_taxonomy_delimiters(taxonomy)

    if id_map_lines:
        id_map = dict([map(strip, line.split('\t')) for line in
            id_map_lines])
        new_taxonomy = dict([(id_map[k], v) for k, v in taxonomy.items()
            if k in id_map])
        assert new_taxonomy != taxonomy
        taxonomy = new_taxonomy

    for line in otu_lines:
        if not line.endswith('\n'):
            line += '\n'
        if line.startswith('#OTU ID'):
            outfile.write(line[:-1]+'\tConsensus Lineage\n')
        elif line.startswith('#'):
            outfile.write(line)
        else:
            id_, rest = line.split('\t', 1)
            t = taxonomy.get(id_, 'None')
            outfile.write(line[:-1]+'\t'+t+'\n')
Example #10
0
 def test_parallel_rdp_taxonomy_assigner(self):
     """ parallel_rdp_taxonomy_assigner functions as expected """
     
     params = {'id_to_taxonomy_fp':self.id_to_taxonomy_file.name,
       'rdp_max_memory':1500,
       'rdp_classifier_fp':getenv('RDP_JAR_PATH'),
       'confidence':0.80,
       'reference_seqs_fp':self.reference_seqs_file.name
     }
     
     app = ParallelRdpTaxonomyAssigner()
     r = app(self.tmp_seq_filepath,
             self.test_out,
             params,
             job_prefix='RDPTEST',
             poll_directly=True,
             suppress_submit_jobs=False)
     results = fields_to_dict(open(glob(join(
             self.test_out, '*_tax_assignments.txt'))[0], 'U'))
     # some basic sanity checks: we should get the same number of sequences
     # as our input with the same seq IDs. We should have a taxonomy string
     # and a confidence value for each seq as well.
     self.assertEqual(len(results), 2)
     self.assertEqual(len(results['X67228 some description']), 2)
     self.assertEqual(len(results['EF503697']), 2)
Example #11
0
    def test_parallel_uclust_taxonomy_assigner(self):
        """ parallel_uclust_taxonomy_assigner functions as expected """
        params = {
            'id_to_taxonomy_fp': self.id_to_taxonomy_file.name,
            'reference_seqs_fp': self.reference_seqs_file.name,
            'min_consensus_fraction': 0.51,
            'similarity': 0.90,
            'uclust_max_accepts': 3
        }

        app = ParallelUclustConsensusTaxonomyAssigner()
        r = app(self.tmp_seq_filepath,
                self.test_out,
                params,
                job_prefix='UTATEST',
                poll_directly=True,
                suppress_submit_jobs=False)
        results = fields_to_dict(
            open(glob(join(self.test_out, '*_tax_assignments.txt'))[0], 'U'))
        # some basic sanity checks: we should get the same number of sequences
        # as our input with the same seq IDs. We should have a taxonomy string
        # and a confidence value for each seq as well.
        self.assertEqual(len(results), 6)
        self.assertEqual(len(results['s1']), 3)
        self.assertEqual(len(results['s6']), 3)
Example #12
0
def _parse_taxonomic_information(tax_map_lines, taxonomic_levels=8):
    """Parses a taxonomy mapping file to return mapping of seq ID to taxonomy.
    
    Returns a dictionary with sequence ID as the key and a list containing the
    taxonomy at each level. Empty taxonomic levels (i.e. ';;' or levels
    containing only whitespace) are ignored.

    Arguments:
        tax_map_lines - list of lines from the taxonomy mapping file (the
            result of calling readlines() on the open file handle)
        taxonomic_levels - the number of taxonomic levels in the taxonomy
            strings found in the taxonomy mapping file. All taxonomy strings
            must have this number of levels (excluding empty taxonomic levels)
    """
    tax_info = {}

    if tax_map_lines[0] != \
            "ID Number\tGenBank Number\tNew Taxon String\tSource\n":
        raise ValueError("The taxonomy map file appears to be invalid "
                         "because it is either missing the header or has a "
                         "corrupt header.")
    for seq_id, seq_info in fields_to_dict(tax_map_lines[1:]).items():
        if len(seq_info) != 3:
            # fields_to_dict consumed the first column (the seq ID), so
            # three remaining fields means four columns in total
            raise ValueError("The taxonomy map file appears to be invalid "
                             "because it does not have exactly 4 columns.")
        # Split at each level and remove any empty levels or levels that
        # contain only whitespace.
        taxonomy = [level for level in seq_info[1].split(';') \
                    if level.strip() != '']
        if len(taxonomy) != taxonomic_levels:
            raise ValueError("Encountered invalid taxonomy '%s'. Valid "
                    "taxonomy strings must have %d levels separated by "
                    "semicolons." % (seq_info[1], taxonomic_levels))
        tax_info[seq_id] = taxonomy
    return tax_info
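For reference, here is how _parse_taxonomic_information behaves on a minimal, hypothetical input: fields_to_dict strips off the seq ID, so the taxonomy string is seq_info[1], and the default of eight levels must survive after empty levels are dropped. All IDs and taxon names below are invented.

tax_map_lines = [
    'ID Number\tGenBank Number\tNew Taxon String\tSource\n',
    'seq1\tAB012345\tk__A;p__B;c__C;o__D;f__E;g__F;s__G;t__H\tsource1\n',
]
tax_info = _parse_taxonomic_information(tax_map_lines)
# tax_info == {'seq1': ['k__A', 'p__B', 'c__C', 'o__D',
#                       'f__E', 'g__F', 's__G', 't__H']}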
Example #13
0
    def test_write_otu_map_prefix(self):
        """write_otu_map functions as expected w otu prefix """
        write_otu_map(self.otu_map1, self.tmp_fp1, "my.otu.")
        actual = fields_to_dict(open(self.tmp_fp1))
        self.files_to_remove.append(self.tmp_fp1)

        exp = {"my.otu.0": ["seq1", "seq2", "seq5"], "my.otu.1": ["seq3", "seq4"], "my.otu.2": ["seq6", "seq7", "seq8"]}
        self.assertEqual(actual, exp)
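This test (and the unprefixed variant in Examples #27 and #28) effectively specifies write_otu_map as the inverse of fields_to_dict: one tab-separated line per OTU, with an optional prefix on the OTU ID. A minimal sketch consistent with that round trip, assuming otu_map is an iterable of (otu_id, seq_ids) pairs as dict(self.otu_map1) suggests; write_otu_map_sketch is an assumed name and the real QIIME implementation may differ in detail.

def write_otu_map_sketch(otu_map, out_fp, otu_id_prefix=''):
    """Write one '<prefix><otu_id>\t<seq_id>\t...' line per OTU."""
    out_f = open(out_fp, 'w')
    for otu_id, seq_ids in otu_map:
        out_f.write('%s%s\t%s\n' %
                    (otu_id_prefix, otu_id, '\t'.join(seq_ids)))
    out_f.close()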
Example #14
0
def _generate_taxonomic_agreement_summary(otu_map_lines,
                                          tax_map_lines,
                                          taxonomic_levels=8):
    """Computes a summary of taxonomic agreement between ref and its seqs.

    Returns a dictionary with OTU ID as the key. The value is a four-element
    list. The first element is the size of the OTU (i.e. the number of seqs in
    the OTU, including the reference). The second element is a list of sequence
    identifiers for each sequence in the OTU. The reference sequence ID will
    always be listed first, followed by the sequence IDs of the other members
    of the OTU as they appear in the OTU map. The third element is a list
    containing percent agreement at each taxonomic level (a list of floats).
    The fourth element is a list containing all taxonomic values that were
    encountered at each level. The reference taxonomic value will always be
    listed first. The third and fourth elements of the top-level list will
    always be the same length (taxonomic_levels) because they each contain
    information for each taxonomic level.

    Arguments:
        otu_map_lines - list of lines in the OTU map (the result of calling
            readlines() on the open file handle)
        tax_map_lines - list of lines from the taxonomy mapping file (the
            result of calling readlines() on the open file handle)
        taxonomic_levels - the number of taxonomic levels in the taxonomy
            strings found in the taxonomy mapping file. All taxonomy strings
            must have this number of levels to prevent inconsistent results in
            the summary
    """
    tax_map = _parse_taxonomic_information(tax_map_lines, taxonomic_levels)
    otu_map = fields_to_dict(otu_map_lines)

    taxonomic_agreement = {}
    for otu_id, seq_ids in otu_map.items():
        otu_size = len(seq_ids)
        taxonomic_agreement[otu_id] = [otu_size, seq_ids, [], []]

        # The reference sequence is always the first sequence listed in the OTU
        # map.
        ref_seq_id = seq_ids[0]
        ref_seq_tax = tax_map[ref_seq_id]

        # Calculate percent agreement for each taxonomic level. If the OTU only
        # contains a reference sequence, the percent agreement will be 100%.
        # Also keep track of all unique taxonomic values that are encountered
        # for each level (with the reference's taxonomic value listed first).
        for level_idx, ref_level in enumerate(ref_seq_tax):
            agreement_count = 0
            encountered_levels = []
            for seq_id in seq_ids:
                seq_level = tax_map[seq_id][level_idx]
                if ref_level == seq_level:
                    agreement_count += 1
                if seq_level not in encountered_levels:
                    encountered_levels.append(seq_level)
            # the docstring promises a list of floats, so this relies on
            # true division (e.g. from __future__ import division under
            # Python 2)
            taxonomic_agreement[otu_id][2].append(
                (agreement_count / otu_size) * 100)
            taxonomic_agreement[otu_id][3].append(encountered_levels)
    return taxonomic_agreement
Example #15
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in fields_to_dict(
            open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    if opts.method == 'mantel':
        output_f = open(path.join(opts.output_dir, 'mantel_results.txt'), 'w')
        output_f.write(
            run_mantel_test('mantel',
                            input_dm_fps,
                            distmats,
                            opts.num_permutations,
                            opts.tail_type,
                            comment_mantel_pmantel,
                            sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        output_f = open(
            path.join(opts.output_dir, 'partial_mantel_results.txt'), 'w')
        output_f.write(
            run_mantel_test('partial_mantel',
                            input_dm_fps,
                            distmats,
                            opts.num_permutations,
                            opts.tail_type,
                            comment_mantel_pmantel,
                            control_dm_fp=opts.control_dm,
                            control_dm=parse_distmat(open(
                                opts.control_dm, 'U')),
                            sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        output_f = open(
            path.join(opts.output_dir, 'mantel_correlogram_results.txt'), 'w')
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps,
            distmats,
            opts.num_permutations,
            comment_corr,
            opts.alpha,
            sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)

        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type),
                         format=opts.image_type)
    output_f.close()
Example #16
0
def _generate_taxonomic_agreement_summary(otu_map_lines, tax_map_lines,
                                          taxonomic_levels=8):
    """Computes a summary of taxonomic agreement between ref and its seqs.

    Returns a dictionary with OTU ID as the key. The value is a four-element
    list. The first element is the size of the OTU (i.e. the number of seqs in
    the OTU, including the reference). The second element is a list of sequence
    identifiers for each sequence in the OTU. The reference sequence ID will
    always be listed first, followed by the sequence IDs of the other members
    of the OTU as they appear in the OTU map. The third element is a list
    containing percent agreement at each taxonomic level (a list of floats).
    The fourth element is a list containing all taxonomic values that were
    encountered at each level. The reference taxonomic value will always be
    listed first. The third and fourth elements of the top-level list will
    always be the same length (taxonomic_levels) because they each contain
    information for each taxonomic level.

    Arguments:
        otu_map_lines - list of lines in the OTU map (the result of calling
            readlines() on the open file handle)
        tax_map_lines - list of lines from the taxonomy mapping file (the
            result of calling readlines() on the open file handle)
        taxonomic_levels - the number of taxonomic levels in the taxonomy
            strings found in the taxonomy mapping file. All taxonomy strings
            must have this number of levels to prevent inconsistent results in
            the summary
    """
    tax_map = _parse_taxonomic_information(tax_map_lines, taxonomic_levels)
    otu_map = fields_to_dict(otu_map_lines)

    taxonomic_agreement = {}
    for otu_id, seq_ids in otu_map.items():
        otu_size = len(seq_ids)
        taxonomic_agreement[otu_id] = [otu_size, seq_ids, [], []]

        # The reference sequence is always the first sequence listed in the OTU
        # map.
        ref_seq_id = seq_ids[0]
        ref_seq_tax = tax_map[ref_seq_id]

        # Calculate percent agreement for each taxonomic level. If the OTU only
        # contains a reference sequence, the percent agreement will be 100%.
        # Also keep track of all unique taxonomic values that are encountered
        # for each level (with the reference's taxonomic value listed first).
        for level_idx, ref_level in enumerate(ref_seq_tax):
            agreement_count = 0
            encountered_levels = []
            for seq_id in seq_ids:
                seq_level = tax_map[seq_id][level_idx]
                if ref_level == seq_level:
                    agreement_count += 1
                if seq_level not in encountered_levels:
                    encountered_levels.append(seq_level)
            taxonomic_agreement[otu_id][2].append(
                    (agreement_count / otu_size) * 100)
            taxonomic_agreement[otu_id][3].append(encountered_levels)
    return taxonomic_agreement
Example #17
0
    def test_write_otu_map_prefix(self):
        """write_otu_map functions as expected w otu prefix """
        write_otu_map(self.otu_map1, self.tmp_fp1, 'my.otu.')
        actual = fields_to_dict(open(self.tmp_fp1))
        self.files_to_remove.append(self.tmp_fp1)

        exp = {'my.otu.0': ['seq1', 'seq2', 'seq5'],
               'my.otu.1': ['seq3', 'seq4'],
               'my.otu.2': ['seq6', 'seq7', 'seq8']}
        self.assertEqual(actual, exp)
Example #18
0
 def test_write_otu_map_prefix(self):
     """write_otu_map functions as expected w otu prefix """
     write_otu_map(self.otu_map1,self.tmp_fp1,'my.otu.')
     actual = fields_to_dict(open(self.tmp_fp1))
     self.files_to_remove.append(self.tmp_fp1)
     
     exp = {'my.otu.0':['seq1','seq2','seq5'],
            'my.otu.1':['seq3','seq4'],
            'my.otu.2':['seq6','seq7','seq8']}
     self.assertEqual(actual,exp)
Example #19
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory " "specified with the -o option.")
    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in fields_to_dict(open(opts.sample_id_map_fp, "U")).items()])
    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, "U")) for dm_fp in input_dm_fps]

    if opts.method == "mantel":
        output_f = open(path.join(opts.output_dir, "mantel_results.txt"), "w")
        output_f.write(
            run_mantel_test(
                "mantel",
                input_dm_fps,
                distmats,
                opts.num_permutations,
                opts.tail_type,
                comment_mantel_pmantel,
                sample_id_map=sample_id_map,
            )
        )
    elif opts.method == "partial_mantel":
        output_f = open(path.join(opts.output_dir, "partial_mantel_results.txt"), "w")
        output_f.write(
            run_mantel_test(
                "partial_mantel",
                input_dm_fps,
                distmats,
                opts.num_permutations,
                opts.tail_type,
                comment_mantel_pmantel,
                control_dm_fp=opts.control_dm,
                control_dm=parse_distmat(open(opts.control_dm, "U")),
                sample_id_map=sample_id_map,
            )
        )
    elif opts.method == "mantel_corr":
        output_f = open(path.join(opts.output_dir, "mantel_correlogram_results.txt"), "w")
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr, opts.alpha, sample_id_map=sample_id_map
        )
        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir, corr_fp + opts.image_type), format=opts.image_type)
    output_f.close()
Example #20
0
    def test_fields_to_dict(self):
        """fields_to_dict should make first field key, rest val"""
        test_data = \
"""0	R27DLI_4812	R27DLI_600	R27DLI_727	U1PLI_403	U1PLI_8969	U1PLI_9080	U1PLI_9526	W3Cecum_6642	W3Cecum_8992
1	U1PLI_7889
2	W3Cecum_4858
3	R27DLI_3243	R27DLI_4562	R27DLI_6828	R27DLI_9097	U1PLI_2780	U1PLI_67	U9PSI_10475	U9PSI_4341	W3Cecum_5191""".splitlines()    #output from cd-hit
        obs = fields_to_dict(test_data)
        exp = {'0':['R27DLI_4812','R27DLI_600','R27DLI_727','U1PLI_403','U1PLI_8969','U1PLI_9080','U1PLI_9526','W3Cecum_6642','W3Cecum_8992'],
                '1':['U1PLI_7889'],
                '2':['W3Cecum_4858'],
                '3':['R27DLI_3243','R27DLI_4562','R27DLI_6828','R27DLI_9097','U1PLI_2780','U1PLI_67','U9PSI_10475','U9PSI_4341','W3Cecum_5191']}
        self.assertEqual(obs, exp)
Example #21
0
def main():
    """opens files as necessary based on prefs"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment
    data['aln'] = SequenceCollection.from_fasta_records(
        parse_fasta(open(fasta_file)), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    otu_f = open(otu_path, 'U')
    otus = fields_to_dict(otu_f)
    otu_f.close()

    data['otus'] = otus
    # Determine which samples to extract from representative seqs
    # and from otus file
    # default to an empty prefs dict so the filter call at the end of
    # this function doesn't hit a NameError when no samples are specified
    prefs = {}
    if opts.samples_to_extract:
        prefs = process_extract_samples(opts.samples_to_extract)

    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    if opts.output_dir:
        if os.path.exists(opts.output_dir):
            dir_path = opts.output_dir
        else:
            try:
                os.mkdir(opts.output_dir)
                dir_path = opts.output_dir
            except OSError:
                # fall back to the current directory if the output
                # directory cannot be created
                dir_path = './'
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
Example #22
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    exclude_otus_fp = opts.exclude_otus_fp
    
    if opts.output_fp:
        outfile = open(opts.output_fp, 'w')
    else:
        outfile = stdout
    if not opts.taxonomy_fname:
        otu_to_taxonomy = None
    else:
        infile = open(opts.taxonomy_fname, 'U')
        otu_to_taxonomy = parse_taxonomy(infile)

    otu_to_seqid = fields_to_dict(open(opts.otu_map_fp, 'U'))
    
    if exclude_otus_fp:
        otu_to_seqid = remove_otus(otu_to_seqid,open(exclude_otus_fp,'U'))

    outfile.write(make_otu_table(otu_to_seqid, otu_to_taxonomy))
Example #23
0
    def test_fields_to_dict(self):
        """fields_to_dict should make first field key, rest val"""
        test_data = \
"""0	R27DLI_4812	R27DLI_600	R27DLI_727	U1PLI_403	U1PLI_8969	U1PLI_9080	U1PLI_9526	W3Cecum_6642	W3Cecum_8992
1	U1PLI_7889
2	W3Cecum_4858
3	R27DLI_3243	R27DLI_4562	R27DLI_6828	R27DLI_9097	U1PLI_2780	U1PLI_67	U9PSI_10475	U9PSI_4341	W3Cecum_5191""".splitlines()    #output from cd-hit
        obs = fields_to_dict(test_data)
        exp = {
            '0': [
                'R27DLI_4812', 'R27DLI_600', 'R27DLI_727', 'U1PLI_403',
                'U1PLI_8969', 'U1PLI_9080', 'U1PLI_9526', 'W3Cecum_6642',
                'W3Cecum_8992'
            ],
            '1': ['U1PLI_7889'],
            '2': ['W3Cecum_4858'],
            '3': [
                'R27DLI_3243', 'R27DLI_4562', 'R27DLI_6828', 'R27DLI_9097',
                'U1PLI_2780', 'U1PLI_67', 'U9PSI_10475', 'U9PSI_4341',
                'W3Cecum_5191'
            ]
        }
        self.assertEqual(obs, exp)
Example #24
0
    def test_parallel_uclust_taxonomy_assigner(self):
        """ parallel_uclust_taxonomy_assigner functions as expected """
        params = {'id_to_taxonomy_fp': self.id_to_taxonomy_file.name,
                  'reference_seqs_fp': self.reference_seqs_file.name,
                  'min_consensus_fraction': 0.51,
                  'similarity': 0.90,
                  'uclust_max_accepts': 3
                  }

        app = ParallelUclustConsensusTaxonomyAssigner()
        r = app(self.tmp_seq_filepath,
                self.test_out,
                params,
                job_prefix='UTATEST',
                poll_directly=True,
                suppress_submit_jobs=False)
        results = fields_to_dict(open(glob(join(
            self.test_out, '*_tax_assignments.txt'))[0], 'U'))
        # some basic sanity checks: we should get the same number of sequences
        # as our input with the same seq IDs. We should have a taxonomy string
        # and a confidence value for each seq as well.
        self.assertEqual(len(results), 6)
        self.assertEqual(len(results['s1']), 3)
        self.assertEqual(len(results['s6']), 3)
Example #25
0
 def test_parallel_blast_taxonomy_assigner(self):
     """ parallel_blast_taxonomy_assigner functions as expected """
     params = {'id_to_taxonomy_fp':self.id_to_taxonomy_file.name,
       'blastmat_dir':None,
       'e_value':0.001,
       'blast_db':None,
       'reference_seqs_fp':self.reference_seqs_file.name
     }
     
     app = ParallelBlastTaxonomyAssigner()
     r = app(self.tmp_seq_filepath,
             self.test_out,
             params,
             job_prefix='BTATEST',
             poll_directly=True,
             suppress_submit_jobs=False)
     results = fields_to_dict(open(glob(join(
             self.test_out, '*_tax_assignments.txt'))[0], 'U'))
     # some basic sanity checks: we should get the same number of sequences
     # as our input with the same seq IDs. We should have a taxonomy string
     # and a confidence value for each seq as well.
     self.assertEqual(len(results), 6)
     self.assertEqual(len(results['s1']), 3)
     self.assertEqual(len(results['s6']), 3)
Example #26
0
def _parse_taxonomic_information(tax_map_lines, taxonomic_levels=8):
    """Parses a taxonomy mapping file to return mapping of seq ID to taxonomy.
    
    Returns a dictionary with sequence ID as the key and a list containing the
    taxonomy at each level. Empty taxonomic levels (i.e. ';;' or levels
    containing only whitespace) are ignored.

    Arguments:
        tax_map_lines - list of lines from the taxonomy mapping file (the
            result of calling readlines() on the open file handle)
        taxonomic_levels - the number of taxonomic levels in the taxonomy
            strings found in the taxonomy mapping file. All taxonomy strings
            must have this number of levels (excluding empty taxonomic levels)
    """
    tax_info = {}

    if tax_map_lines[0] != \
            "ID Number\tGenBank Number\tNew Taxon String\tSource\n":
        raise ValueError("The taxonomy map file appears to be invalid "
                         "because it is either missing the header or has a "
                         "corrupt header.")
    for seq_id, seq_info in fields_to_dict(tax_map_lines[1:]).items():
        if len(seq_info) != 3:
            # fields_to_dict consumed the first column (the seq ID), so
            # three remaining fields means four columns in total
            raise ValueError("The taxonomy map file appears to be invalid "
                             "because it does not have exactly 4 columns.")
        # Split at each level and remove any empty levels or levels that
        # contain only whitespace.
        taxonomy = [level for level in seq_info[1].split(';') \
                    if level.strip() != '']
        if len(taxonomy) != taxonomic_levels:
            raise ValueError(
                "Encountered invalid taxonomy '%s'. Valid "
                "taxonomy strings must have %d levels separated by "
                "semicolons." % (seq_info[1], taxonomic_levels))
        tax_info[seq_id] = taxonomy
    return tax_info
Example #27
0
 def test_write_otu_map(self):
     """write_otu_map functions as expected """
     write_otu_map(self.otu_map1,self.tmp_fp1)
     actual = fields_to_dict(open(self.tmp_fp1))
     self.files_to_remove.append(self.tmp_fp1)
     self.assertEqual(actual,dict(self.otu_map1))
Example #28
0
 def test_write_otu_map(self):
     """write_otu_map functions as expected """
     write_otu_map(self.otu_map1, self.tmp_fp1)
     actual = fields_to_dict(open(self.tmp_fp1))
     self.files_to_remove.append(self.tmp_fp1)
     self.assertEqual(actual, dict(self.otu_map1))
Example #29
0
def load_otu_mapping(data_access, input_dir, analysis_id):
    """ Load the OTU table into the DB """
    
    # For OTU Tables
    # read in the workflow log file and determine timestamp and svn version of
    # Qiime used for the analysis
    pOTUs_threshold = '97'
    ref_set_threshold = '97'
    pOTUs_method='UCLUST_REF'
    reference_set_name='GREENGENES_REFERENCE'
    otus_log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).read()
    log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).readlines()
    
    #from the workflow log file get the pick-otus cmd
    for substr in log_str:
        if 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd=substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd=substr
    
    # define values for otu_picking_run table
    otu_run_set_id = 0
    svn_version = '1418' # This is temporarily defined; a script will eventually determine this value
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")    
    pick_otus_map = join(input_dir, 'gg_97_otus', 'exact_uclust_ref_otus.txt')
    
    # get md5 for split-lib seq file
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_seqs_md5=safe_md5(open(split_lib_seqs)).hexdigest()
    
    # Insert the otu-picking log information in the DB
    print 'calling loadAllOTUInfo with analysis_id %s' % str(analysis_id)
    valid,new_otu_run_set_id,otu_picking_run_id=data_access.loadAllOTUInfo(True,
                                  otu_run_set_id, run_date,
                                  pOTUs_method, pOTUs_threshold,
                                  svn_version, pick_otus_cmd, otus_log_str,
                                  split_lib_seqs_md5,reference_set_name,
                                  ref_set_threshold, analysis_id)
    if not valid:
        raise ValueError, 'Error: Unable to load OTU run data into database!'
    else:
        print "Finished registering OTU run!"
    
    # define OTU mapping
    otu_map=[]
    otu_to_seqid = fields_to_dict(open(pick_otus_map, 'U'))
    for otu in otu_to_seqid:
        for sample in otu_to_seqid[otu]:
            otu_map.append('%s\t%s\t%s\t%s' % (otu,sample,new_otu_run_set_id, 
                                               reference_set_name))
    print 'Finished setting otu_map.'
    
    # define oracle data types
    types = ['s','s','i','s']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    #print 'Starting PK_SPLIT_LIBRARY_READ_MAP index rebuild...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    print 'Finished rebuilding index PK_SPLIT_LIBRARY_READ_MAP.'
    cur = con.cursor()
    set_count = 1
    
    # prepare the OTU table for loading
    print 'Loading OTU Table into the database!'
    pick_otus_table = join(input_dir, 'gg_97_otus',
                           'exact_uclust_ref_otu_table.txt')
    otu_table_lines=open(pick_otus_table).readlines()
    sample_ids, otu_ids, otu_table, lineages = \
                                    parse_classic_otu_table(otu_table_lines)
    
    # convert OTU table to tab-delimited list
    otu_table_load=[]
    for i,otu in enumerate(otu_ids):
        for j,sample in enumerate(sample_ids):
            if otu_table[i][j]>0:
                otu_table_load.append("%s\t%s\t%s\t%s" % \
                                (otu,sample,new_otu_run_set_id,otu_table[i][j]))

    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    # load otu table into DB
    data_types=['s','s','i','f']   
    set_count = 0      
    for input_set in input_set_generator(otu_table_load, cur,data_types,\
                                         buffer_size=1000):
        valid=data_access.loadOTUTable(True,input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load OTU table!'
        print "loading OTU Table: %s" % set_count
        set_count += 1
    
    print 'Successfully loaded the OTU Table into the database!'
    print 'End of function' 
Example #30
0
    def __call__(self,
                 seq_path,
                 otu_path,
                 reference_path,
                 result_path=None,
                 log_path=None,
                 sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        if seq_path:
            seq_f = open(seq_path, 'U')
            seqs = dict(parse_fasta(seq_f, label_to_name=label_to_name))
            seq_f.close()
        else:
            # allows the user to not pass seqs, which can be useful when
            # all otus are based on reference sequences
            seqs = {}

        # Load the reference_path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        reference_f = open(reference_path, 'U')
        reference_seqs = dict(
            parse_fasta(reference_f, label_to_name=label_to_name))
        reference_f.close()

        # Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        # actually pick the set
        result = {}
        for set_id, ids in otus.items():
            if set_id in reference_seqs:
                result[set_id] = (reference_seqs, set_id)
            elif seqs:
                result[set_id] = (seqs, choice_f(ids, seqs))
            else:
                raise KeyError(
                    "Unknown reference sequence identifier: %s\n" % set_id +
                    "Have you provided the correct reference sequence file? " +
                    "Did you forget to provide a seqs filepath for de novo OTUs?"
                )

        if result_path:
            of = open(result_path, 'w')
            if sort_by == 'seq_id':

                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, rep in sorted(result.items(), key=key):
                seq_lookup, id_ = rep
                try:
                    of.write('>%s %s\n%s\n' % (cluster, id_, seq_lookup[id_]))
                except KeyError:
                    raise KeyError(
                        "Sequence identifiers (%s and %s) " % (cluster, id_) +
                        "not found in reference or sequence collection.")
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # The return value here differs from GenericRepSetPicker
            # because it is possible for the representative sequences
            # to be ambiguous. For example, if the identifiers in
            # seq_path and reference_path are both integers, returning
            # a sequence identifier is not sufficient to determine which
            # sequence collection the reference sequence came from.
            # Therefore if the user did not provide a result_path, store
            # the result in a dict of {otu_id: (rep_id, rep_seq)},
            log_str = 'Result path: None, returned as dict.'

            for cluster, rep in result.items():
                seq_lookup, id_ = rep
                try:
                    result[cluster] = (id_, seq_lookup[id_])
                except KeyError:
                    raise KeyError(
                        "Sequence identifiers (%s and %s) " % (cluster, id_) +
                        "not found in reference or sequence collection.")

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)

        # return the result (note this is None if the data was
        # written to file)
        return result
Example #31
0
    def __call__(self,
                 seq_path,
                 otu_path,
                 result_path=None,
                 log_path=None,
                 sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        seq_f = open(seq_path, 'U')
        seqs = dict(parse_fasta(seq_f, label_to_name=label_to_name))
        seq_f.close()

        # Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        # actually pick the set
        result = {}
        for set_id, ids in otus.items():
            result[set_id] = choice_f(ids, seqs)

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            of = open(result_path, 'w')
            if sort_by == 'seq_id':

                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, id_ in sorted(result.items(), key=key):
                of.write('>%s %s\n%s\n' % (cluster, id_, seqs[id_]))
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # if the user did not provide a result_path, store
            # the result in a dict of {otu_id: rep_id},
            log_str = 'Result path: None, returned as dict.'

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)

        # return the result (note this is None if the data was
        # written to file)
        return result
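When sort_by='seq_id', the key function above orders clusters by the integer suffix of the representative sequence ID, falling back to the raw item when the suffix is not numeric. A quick illustration of the same key expression, with invented IDs:

items = [('otuA', 'seq_12'), ('otuB', 'seq_3')]
items.sort(key=lambda s: int(s[1].split('_', 1)[-1]))
# items == [('otuB', 'seq_3'), ('otuA', 'seq_12')]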
Example #32
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trials for Monte Carlo analysis.')
        
    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions
    
    if not exists(output_dir): 
        makedirs(output_dir)
    
    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None
  
    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    input_fp1_dir, input_fn1 = split(input_fp1)
    input_fp1_basename, input_fp1_ext = splitext(input_fn1)
    input_fp2_dir, input_fn2 = split(input_fp2)
    input_fp2_basename, input_fp2_ext = splitext(input_fn2)
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
     (output_dir,input_fp1_basename,input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir
    
    if sample_id_map_fp:
        sample_id_map = dict([(k,v[0]) \
         for k,v in fields_to_dict(open(sample_id_map_fp, "U")).items()])
    else:
        sample_id_map = None
    
    transformed_coords1, transformed_coords2, m_squared, randomized_coords2 =\
      get_procrustes_results(open(input_fp1,'U'),\
                             open(input_fp2,'U'),\
                             sample_id_map=sample_id_map,\
                             randomize=False,
                             max_dimensions=num_dimensions)
    output_matrix1_f = open(output_matrix1_fp,'w')
    output_matrix1_f.write(transformed_coords1)
    output_matrix1_f.close()
    output_matrix2_f = open(output_matrix2_fp,'w')
    output_matrix2_f.write(transformed_coords2)
    output_matrix2_f.close()
    
    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        coords_f1 = list(open(input_fp1,'U'))
        coords_f2 = list(open(input_fp2,'U'))
        actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
         procrustes_monte_carlo(coords_f1,\
                                coords_f2,\
                                trials=random_trials,\
                                max_dimensions=num_dimensions,
                                sample_id_map=sample_id_map,
                                trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append('%s %s %s %s %d %1.3f' %\
         (input_fp1, input_fp2, str(max_dims_str), mc_p_value_str,\
          count_better, actual_m_squared))
        f = open(output_summary_fp,'w')
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
        f.close()
Example #33
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trials for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict([
                    (k, v[0]) for k, v in fields_to_dict(f).iteritems()
                ])
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                trial_output_dir = join(output_dir, 'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None
            coords_f1 = open(reference_input_fp, 'U')
            coords_f2 = open(query_input_fp, 'U')
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
                procrustes_monte_carlo(coords_f1,
                                       coords_f2,
                                       trials=random_trials,
                                       max_dimensions=num_dimensions,
                                       sample_id_map=sample_id_map,
                                       trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
Example #34
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials
    
    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trials for Monte Carlo analysis.')
    
    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
       option_parser.error('If providing sample id maps, there must be exactly'
                           ' one fewer sample id maps than input coordinate'
                           ' matrices.')
    
    if not exists(output_dir): 
        makedirs(output_dir)
  
    reference_input_fp = input_fps[0]
    reference_input_fp_dir, input_fn1 = split(reference_input_fp)
    reference_input_fp_basename, reference_input_fp_ext = splitext(input_fn1)
    output_summary_fp = join(output_dir,'procrustes_results.txt')
    summary_file_lines = \
     ['#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
      '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.']
    
    for i,query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_dir, query_input_fn = split(query_input_fp)
        query_input_fp_basename, query_input_fp_ext = splitext(query_input_fn)
        output_matrix1_fp = join(output_dir,
         '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(output_dir,\
         '%s_transformed_q%d.txt' % (query_input_fp_basename, i+1))
        
        if sample_id_map_fps:
            sample_id_map = dict([(k,v[0]) \
             for k,v in fields_to_dict(open(sample_id_map_fps[i], "U")).items()])
        else:
            sample_id_map = None
        
        transformed_coords1, transformed_coords2, m_squared, randomized_coords2 =\
          get_procrustes_results(open(reference_input_fp,'U'),\
                                 open(query_input_fp,'U'),\
                                 sample_id_map=sample_id_map,\
                                 randomize=False,
                                 max_dimensions=num_dimensions)
        
        output_matrix1_f = open(output_matrix1_fp,'w')
        output_matrix1_f.write(transformed_coords1)
        output_matrix1_f.close()
        output_matrix2_f = open(output_matrix2_fp,'w')
        output_matrix2_f.write(transformed_coords2)
        output_matrix2_f.close()
        
        if random_trials:
            if opts.store_trial_details:
                trial_output_dir = join(output_dir,'trial_details_%d' % (i+2))
            else:
                trial_output_dir = None
            coords_f1 = list(open(reference_input_fp,'U'))
            coords_f2 = list(open(query_input_fp,'U'))
            actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
             procrustes_monte_carlo(coords_f1,
                                    coords_f2,
                                    trials=random_trials,
                                    max_dimensions=num_dimensions,
                                    sample_id_map=sample_id_map,
                                    trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value, random_trials)
            summary_file_lines.append('%s\t%s\t%s\t%s\t%d\t%1.3f' %\
             (reference_input_fp, query_input_fp, max_dims_str, mc_p_value_str,\
              count_better, actual_m_squared))
        else:
            summary_file_lines.append('%s\t%s\t%s\tNA\tNA\t%1.3f' %\
             (reference_input_fp, query_input_fp, max_dims_str, m_squared))
    # Write output summary
    f = open(output_summary_fp,'w')
    f.write('\n'.join(summary_file_lines))
    f.write('\n')
    f.close()
Example #35
0
    def __call__(self, seq_path, otu_path, result_path=None, log_path=None,
                 sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        seq_f = open(seq_path, 'U')
        seqs = dict(parse_fasta(seq_f, label_to_name=label_to_name))
        seq_f.close()

        # Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        # actually pick the set
        result = {}
        for set_id, ids in otus.items():
            result[set_id] = choice_f(ids, seqs)

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            of = open(result_path, 'w')
            if sort_by == 'seq_id':
                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, id_ in sorted(result.items(), key=key):
                of.write('>%s %s\n%s\n' % (cluster, id_, seqs[id_]))
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # if the user did not provide a result_path, store
            # the result in a dict of {otu_id: rep_id},
            log_str = 'Result path: None, returned as dict.'

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)
            log_file.close()

        # return the result (note this is None if the data was
        # written to file)
        return result
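The picking loop above delegates the actual choice to self.Params['ChoiceF'], a callable that receives a cluster's sequence IDs plus the sequence lookup and returns the representative's ID. As a hypothetical illustration (longest_seq_choice is invented here, not taken from the source), a ChoiceF that keeps the longest sequence in each cluster could be written as:

def longest_seq_choice(ids, seqs):
    # hypothetical ChoiceF: ids is the list of sequence IDs in one OTU,
    # seqs maps sequence ID -> sequence string; keep the longest member
    return max(ids, key=lambda seq_id: len(seqs[seq_id]))

The signature matches the choice_f(ids, seqs) call in the loop above.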
Example #36
0
    def __call__(self, seq_path, otu_path, reference_path,
                 result_path=None, log_path=None, sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        reference_path: path to file of reference sequences
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        if seq_path:
            seq_f = open(seq_path, 'U')
            seqs = dict(parse_fasta(seq_f, label_to_name=label_to_name))
            seq_f.close()
        else:
            # allows the user to not pass seqs, which can be useful when
            # all otus are based on reference sequences
            seqs = {}

        # Load the reference_path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        reference_f = open(reference_path, 'U')
        reference_seqs = dict(
            parse_fasta(reference_f, label_to_name=label_to_name))
        reference_f.close()

        # Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        # actually pick the set
        result = {}
        for set_id, ids in otus.items():
            if set_id in reference_seqs:
                result[set_id] = (reference_seqs, set_id)
            elif seqs:
                result[set_id] = (seqs, choice_f(ids, seqs))
            else:
                raise KeyError("Unknown reference sequence identifier: %s\n" % set_id +
                               "Have you provided the correct reference sequence file? " +
                               "Did you forget to provide a seqs filepath for de novo OTUs?")

        if result_path:
            # if the user provided a result_path, write the
            # results to file as one fasta record per cluster
            of = open(result_path, 'w')
            if sort_by == 'seq_id':
                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, rep in sorted(result.items(), key=key):
                seq_lookup, id_ = rep
                try:
                    of.write('>%s %s\n%s\n' % (cluster, id_, seq_lookup[id_]))
                except KeyError:
                    raise KeyError("Sequence identifiers (%s and %s) " % (cluster, id_) +
                                   "not found in reference or sequence collection.")
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # The return value here differs from GenericRepSetPicker
            # because it is possible for the representative sequences
            # to be ambiguous. For example, if the identifiers in
            # seq_path and reference_path are both integers, returning
            # a sequence identifier is not sufficient to determine which
            # sequence collection the reference sequence came from.
            # Therefore, if the user did not provide a result_path, the
            # result is stored as a dict of {otu_id: (rep_id, rep_seq)}.
            log_str = 'Result path: None, returned as dict.'

            for cluster, rep in result.items():
                seq_lookup, id_ = rep
                try:
                    result[cluster] = (id_, seq_lookup[id_])
                except KeyError:
                    raise KeyError("Sequence identifiers (%s and %s) " % (cluster, id_) +
                                   "not found in reference or sequence collection.")

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)
            log_file.close()

        # return the result (note this is None if the data was
        # written to file)
        return result
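To make the ambiguity described in the comment concrete: if the de novo and reference collections both use bare numeric identifiers, the same rep_id can name two different sequences, which is why the dict form returns the sequence itself. A toy illustration (all identifiers and sequences invented):

# '42' names a different sequence in each collection, so a bare
# rep_id of '42' would be ambiguous; pairing it with its sequence,
# as in ('42', 'TTTT'), is not.
seqs = {'42': 'ACGT'}            # de novo collection
reference_seqs = {'42': 'TTTT'}  # reference collection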
def compute_sequence_stats(fasta_lines, tax_map_lines, unknown_keywords=None):
    """Generates statistics for the input sequences.

    Returns a dictionary with sequence ID as the key, and a list of statistics
    as the value. The statistics (in order of placement in the list) are:
        relevant taxonomic depth (integer)
        sequence read length (integer)
        sequence data (string)
    
    For example, the relevant taxonomic depth for the taxonomy string 'A;B;C'
    would be 3. If unknown_keywords is supplied, any taxonomic level matching
    a value in unknown_keywords will be ignored. Empty or whitespace-only
    taxonomic levels will also be ignored in the count. For example, if 'Z' is
    an unknown keyword, the relevant taxonomic depth of 'A;B;Z;C' will be 3.

    Arguments:
        fasta_lines - list of lines in FASTA format (the result of calling
            readlines() on the open file handle)
        tax_map_lines - list of lines from the taxonomy mapping file (the
            result of calling readlines() on the open file handle)
        unknown_keywords - a list of strings corresponding to taxonomic level
            strings that should be ignored when computing the relevant
            taxonomic depth
    """
    seq_stats = {}

    if tax_map_lines[0] != \
            "ID Number\tGenBank Number\tNew Taxon String\tSource\n":
        raise ValueError("The taxonomy map file appears to be invalid "
                         "because it is either missing the header or has a "
                         "corrupt header.")

    # Record the taxonomy depths for each sequence.
    for seq_id, seq_info in fields_to_dict(tax_map_lines[1:]).items():
        if len(seq_info) != 3:
            raise ValueError("The taxonomy map file appears to be invalid "
                             "because it does not have exactly 4 columns.")
        # Split at each level and remove any empty levels or levels that
        # contain only whitespace.
        taxonomy = [level for level in seq_info[1].split(';') \
                    if level.strip() != '']

        # Remove any 'unknown' taxonomy levels before computing the known
        # taxonomy depth.
        if unknown_keywords:
            for unknown_keyword in unknown_keywords:
                while unknown_keyword in taxonomy:
                    taxonomy.remove(unknown_keyword)
        seq_stats[seq_id] = [len(taxonomy)]

    # Record the sequence data and sequence length for each sequence.
    for seq_id, seq in MinimalFastaParser(fasta_lines):
        if seq_id in seq_stats:
            seq_stats[seq_id].extend([len(seq), seq])
        else:
            print ("Found sequence id '%s' in the FASTA file that wasn't in "
                   "the taxonomy mapping file\n" % seq_id)
            # Assign a taxonomic depth of 0 because we don't have any
            # taxonomic information for the sequence.
            seq_stats[seq_id] = [0, len(seq), seq]
    return seq_stats
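A hypothetical round trip through compute_sequence_stats (IDs, taxonomy strings, and sequences invented for illustration; note the header line must match the expected string exactly or a ValueError is raised):

tax_map_lines = [
    'ID Number\tGenBank Number\tNew Taxon String\tSource\n',
    'seq1\tAB000001\tA;B;Z;C\tncbi\n',
]
fasta_lines = ['>seq1\n', 'ACCGGTT\n']
stats = compute_sequence_stats(fasta_lines, tax_map_lines,
                               unknown_keywords=['Z'])
# stats['seq1'] -> [3, 7, 'ACCGGTT']: taxonomic depth 3 (the unknown
# keyword 'Z' is ignored), read length 7, and the sequence itself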