def filter_similar_plasmids(plasmid_scores, output_dir):
    """
    Fairly frequently the plasmids recovered end up being close to identical.
    This method sorts through the plasmids to find which plasmids are very similar and picks the best one.
    Plasmids are grouped by average-linkage hierarchical clustering of their pairwise mash distances
    (the tree is cut at a distance of 0.05) and the highest scoring plasmid of each cluster is kept.
    :param plasmid_scores: Dictionary generated by find_plasmid_kmer_scores. The keys are handed to mash.dist
    as the sequences to compare; the values are the scores used to rank plasmids within a cluster.
    :param output_dir: Directory to put temporary files (distances.tab is written here).
    :return: List of plasmids, one per cluster. Empty if plasmid_scores is empty.
    """
    plasmid_names = list(plasmid_scores)
    # With zero or one plasmid there is nothing to compare, and SciPy refuses to cluster an
    # empty distance matrix, so return early.
    if len(plasmid_names) <= 1:
        return plasmid_names

    # Build the condensed distance matrix SciPy expects: one entry per unordered pair, in row-major
    # upper-triangle order. Only those pairs are sent to mash; self comparisons and the mirrored
    # lower triangle are never used by the clustering.
    distance_file = os.path.join(output_dir, 'distances.tab')
    condensed_distances = list()
    for index, query_plasmid in enumerate(plasmid_names):
        for reference_plasmid in plasmid_names[index + 1:]:
            mash.dist(query_plasmid,
                      reference_plasmid,
                      output_file=distance_file)
            condensed_distances.append(mash.read_mash_output(distance_file)[0].distance)

    # Once the distance matrix has been made, feed it into SciPy to do clustering.
    z = cluster.hierarchy.linkage(condensed_distances, method='average')
    clustering = cluster.hierarchy.fcluster(z, 0.05, criterion='distance')

    # Cluster labels start at 1, so label k goes into clusters[k - 1].
    clusters = [list() for _ in range(max(clustering))]
    for plasmid, cluster_id in zip(plasmid_names, clustering):
        clusters[cluster_id - 1].append(plasmid)

    # Use the highest scoring plasmid from each cluster for further analysis. max() keeps the first
    # plasmid on ties and, unlike a running maximum seeded with 0, still picks a real plasmid when
    # every score in the cluster is zero or negative.
    return [max(group, key=plasmid_scores.get) for group in clusters if group]
Exemple #2
0
def check_distances(ref_fasta, fastq_folder, work_dir, max_distance=0.15):
    """
    Sketches every file matching *R1* in fastq_folder, runs mash dist between that sketch and ref_fasta,
    and reports the entries whose mash distance is above the cutoff.
    :param ref_fasta: Passed to mash.dist as the second input.
    :param fastq_folder: Folder searched (pattern *R1*) for the reads to sketch.
    :param work_dir: Directory where sketch.msh and distances.tab are written.
    :param max_distance: Distances strictly greater than this are reported. Defaults to 0.15
    (moved from 0.06, which was definitely too conservative).
    :return: List of the `reference` field of every mash result above the cutoff.
    """
    sketch_file = os.path.join(work_dir, 'sketch.msh')
    distance_file = os.path.join(work_dir, 'distances.tab')
    mash.sketch(os.path.join(fastq_folder, '*R1*'), output_sketch=sketch_file, threads=5)
    mash.dist(sketch_file, ref_fasta, threads=48,
              output_file=distance_file)
    return [item.reference
            for item in mash.read_mash_output(distance_file)
            if item.distance > max_distance]
Exemple #3
0
def check_distances(ref_fasta, fasta_folder, max_distance=0.06):
    """
    Sketches every *.fasta file in fasta_folder, runs mash dist between that sketch and ref_fasta,
    prints each comparison and reports the entries whose mash distance is above the cutoff.
    :param ref_fasta: Passed to mash.dist as the second input.
    :param fasta_folder: Folder holding the *.fasta files to sketch. sketch.msh and distances.tab
    are also written here.
    :param max_distance: Distances strictly greater than this are reported. Defaults to 0.06.
    :return: List of the `reference` field of every mash result above the cutoff.
    """
    sketch_file = os.path.join(fasta_folder, 'sketch.msh')
    distance_file = os.path.join(fasta_folder, 'distances.tab')
    mash.sketch(os.path.join(fasta_folder, '*.fasta'), output_sketch=sketch_file, threads=56)
    mash.dist(sketch_file, ref_fasta, threads=56, output_file=distance_file)
    too_distant = list()
    for item in mash.read_mash_output(distance_file):
        print(item.reference, item.query, str(item.distance))
        if item.distance > max_distance:
            too_distant.append(item.reference)
    return too_distant
Exemple #4
0
def find_closest_refseq_genome(forward_reads, refseq_sketch, outdir):
    """
    Runs mash dist between forward_reads and refseq_sketch and returns the `query` field of the result
    with the smallest distance, with any '_genomic.fna' removed from it.
    Each time a closer hit is found while scanning, its distance and query are printed.
    :param forward_reads: First input handed to mash.dist.
    :param refseq_sketch: Second input handed to mash.dist.
    :param outdir: Directory where distances.tab is written.
    :return: Name of the closest hit, or 'NA' if no result had a distance below 10000.
    """
    print('FINDING CLOSEST REFSEQ GENOME')
    distance_table = os.path.join(outdir, 'distances.tab')
    mash.dist(forward_reads, refseq_sketch, output_file=distance_table)
    # 10000 acts as the "nothing found yet" starting distance.
    best_name, best_distance = 'NA', 10000
    for hit in mash.read_mash_output(result_file=distance_table):
        if hit.distance < best_distance:
            print(hit.distance)
            print(hit.query)
            best_name, best_distance = hit.query, hit.distance
    return best_name.replace('_genomic.fna', '')
Exemple #5
0
def test_mash_dist_call_kwargs():
    """An extra keyword argument to mash.dist (s='34') must show up in the command as ' -s 34'."""
    distance_table = 'tests/distances.tab'
    expected_command = 'mash dist tests/dummy_fastq/*fastq  -p 1  -s 34 > tests/distances.tab'
    _out, _err, command = mash.dist('tests/dummy_fastq/*fastq',
                                    output_file=distance_table,
                                    returncmd=True,
                                    s='34')
    assert command == expected_command
    # Clean up the table produced by the call above.
    os.remove(distance_table)
Exemple #6
0
def test_mash_dist_call_multithreaded():
    """threads=4 must be rendered as '-p 4' in the command returned by mash.dist."""
    distance_table = 'tests/distances.tab'
    expected_command = 'mash dist tests/dummy_fastq/*fastq  -p 4  > tests/distances.tab'
    _out, _err, command = mash.dist('tests/dummy_fastq/*fastq',
                                    output_file=distance_table,
                                    returncmd=True,
                                    threads=4)
    assert command == expected_command
    # Clean up the table produced by the call above.
    os.remove(distance_table)
Exemple #7
0
def test_read_mash_dist():
    """The second parsed row of a real mash dist run must carry the expected reference, query and distance."""
    distance_table = 'tests/distances.tab'
    _out, _err, _cmd = mash.dist('tests/dummy_fastq/*fastq',
                                 output_file=distance_table,
                                 returncmd=True)
    second_row = mash.read_mash_output(distance_table)[1]
    assert second_row.reference == 'tests/dummy_fastq/single.fastq'
    assert second_row.query == 'tests/dummy_fastq/test_R2.fastq'
    assert second_row.distance == 0.00763536
    # Clean up the table produced by the call above.
    os.remove(distance_table)
Exemple #8
0
def closerelatives_redmine(redmine_instance, issue, work_dir, description):
    """
    Redmine automation: finds the strains closest (by mash distance) to a requested SEQID and posts
    them back to the issue, together with a CSV of every distance that was computed.
    :param redmine_instance: Path to a pickle file holding the Redmine client object.
    :param issue: Path to a pickle file holding the Redmine issue being handled.
    :param work_dir: Working directory. The query FASTA is placed in <work_dir>/fasta, and
    distances.tab and close_relatives_results.csv are written directly into work_dir.
    :param description: Path to a pickle file holding the issue description as a sequence of lines.
    Line 1 is the number of close relatives wanted, line 2 is the SEQID to search with.
    :return: None. Results and errors are reported through notes on the Redmine issue; unexpected
    exceptions are also sent to Sentry.
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects.
    # NOTE(review): pickle.load can execute arbitrary code from the file it reads - these paths must
    # only ever point at files written by our own tooling.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        # First line of description should be number of close relatives desired.
        try:
            num_close_relatives = int(description[0])
        except ValueError:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! The first line of the description must be the number'
                                                ' of strains you want to find. The first line of your '
                                                'description was: {}'.format(description[0]),
                                          status_id=4)
            return

        # Second line of description should be the SEQID of what you want to find a close reference for.
        seqid = description[1]

        # Try to extract FASTA files for the specified SEQID.
        retrieve_nas_files(seqids=[seqid],
                           outdir=os.path.join(work_dir, 'fasta'),
                           filetype='fasta',
                           copyflag=False)
        fasta_files = glob.glob(os.path.join(work_dir, 'fasta', '*.fasta'))
        if len(fasta_files) != 1:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! Could not find FASTA file for the specified SEQID. The SEQID'
                                                ' that you specified was: {}'.format(seqid),
                                          status_id=4)
            return

        # Run mash dist with the FASTA file specified against the sketch of all our stuff.
        query_fasta = fasta_files[0]
        distance_file = os.path.join(work_dir, 'distances.tab')
        mash.dist(query_fasta, '/mnt/nas2/redmine/bio_requests/14674/all_sequences.msh',
                  threads=8, output_file=distance_file)
        mash_results = mash.read_mash_output(distance_file)
        # Put all the results into a dictionary, where the key is the sequence name (file name up to the
        # first underscore) and the value is the mash distance between the query FASTA and that sequence.
        result_dict = dict()
        for item in mash_results:
            seq_name = os.path.split(item.query)[-1].split('_')[0]
            result_dict[seq_name] = item.distance

        # Sort the results, store the sorted dictionary keys in a list.
        sorted_distance_results = sorted(result_dict, key=result_dict.get)

        # Prepare a string that lists the top hit SEQIDs to be posted to redmine. Slicing (instead of
        # indexing) means that asking for more relatives than there are sequences lists them all rather
        # than failing; max() keeps a negative request from turning into an "all but the last N" slice.
        top_hits = sorted_distance_results[:max(num_close_relatives, 0)]
        upload_string = ''.join(
            '{} ({})\n'.format(seq.replace('.fasta', ''), str(result_dict[seq])) for seq in top_hits
        )

        # Also make a CSV file of all results, in case someone wants to take a closer look.
        csv_path = os.path.join(work_dir, 'close_relatives_results.csv')
        with open(csv_path, 'w') as f:
            f.write('Strain,MashDistance\n')
            for seq in sorted_distance_results:
                f.write('{},{}\n'.format(seq.replace('.fasta', ''), result_dict[seq]))

        output_list = [
            {
                'path': csv_path,
                'filename': 'close_relatives_results.csv'
            }
        ]
        # Post the list of closely related SEQIDs to redmine, as well as the CSV result file.
        # The number reported is the number of strains actually listed.
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Process complete! Here is the list of the {num_relatives} closest strains '
                                            'to {query_strain} (mash distance between query and result in brackets):'
                                            '\n{upload_string}'.format(num_relatives=str(len(top_hits)),
                                                                       query_strain=seqid,
                                                                       upload_string=upload_string),
                                      status_id=4,
                                      uploads=output_list)

    except Exception as e:
        # Top-level boundary of the task: log the failure to Sentry and tell the requester.
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! We log this automatically and will look into the '
                                            'problem and get back to you with a fix soon.')
Exemple #9
0
def test_dist_no_input_files():
    """Calling mash.dist without any input files must raise ValueError."""
    pytest.raises(ValueError, mash.dist)