Example #1
0
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    One blast run is submitted per file in fasta_files against the database created from good_proteins_fasta,
    after which the retrieved per-file hits are concatenated into a single temporary '.tsv' file.

    Returns the path of the concatenated hits file. The retrieved per-file hits and the remote database are
    cleaned up before returning, and also when one of the steps raises.
    """
    # Create blast database, retrieve its url
    database_url = _create_blast_database(good_proteins_fasta)

    # Filled one by one, so the cleanup below knows exactly which hits files were retrieved
    x_vs_all_hits = []
    try:
        # Submit job for each fasta file, and store jobid & hits file name tuples
        jobids_and_hits_filenames = [
            _submit_blast_run(database_url, fasta_file)
            for fasta_file in fasta_files
        ]

        # Retrieve all results, which should only take negligibly longer than waiting for the slowest results
        for jobid, hits_file in jobids_and_hits_filenames:
            x_vs_all_hits.append(_retrieve_blast_hits(jobid, hits_file))

        # Concatenate the individual blast result files into one.
        # mkstemp returns an open file descriptor next to the path: close it, as only the path is used
        handle, allvsall = tempfile.mkstemp(suffix='.tsv', prefix='all_vs_all')
        os.close(handle)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up local and remote, regardless of whether the steps above succeeded
        for x_vs_all in x_vs_all_hits:
            os.remove(x_vs_all)
        # Remote database_url port :444 requires certificate authentication, which we remove to use the basic authentication
        send_request(database_url.replace(':444', ''), method='DELETE')

    return allvsall
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    The blast database is built inside a temporary run directory, each file in fasta_files is blasted against it,
    and the per-file hits are concatenated into a single temporary '.tsv' file.

    Returns the path of the concatenated hits file. The temporary run directory is removed before returning,
    and also when one of the steps raises.
    """
    run_dir = tempfile.mkdtemp(prefix='reciprocal_blast_')
    try:
        # Create blast database, retrieve path & name
        db_dir, db_name = _create_blast_database(run_dir, good_proteins_fasta)

        # Blast individual fasta files against the made blast databank, instead of the much larger good_proteins_fasta
        x_vs_all_hits = [_blast_file_against_database(db_dir, db_name, fasta) for fasta in fasta_files]

        # Concatenate the individual blast result files into one.
        # The with statement closes the handle right away, so no open file descriptor is leaked; delete=False
        # keeps the created file in place for concatenate to fill
        with tempfile.NamedTemporaryFile(suffix='.tsv', prefix='all-vs-all_', delete=False) as handle:
            allvsall = handle.name
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up, regardless of whether the steps above succeeded
        shutil.rmtree(run_dir)

    return allvsall
Example #3
0
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files."""
    assert tuples_of_gbk_and_ptt_files is not None, 'No genbank files were provided'

    project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + project_id)
    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir, '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
Example #4
0
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    The blast database is built inside a temporary run directory, each file in fasta_files is blasted against it,
    and the per-file hits are concatenated into a single temporary '.tsv' file.

    Returns the path of the concatenated hits file. The temporary run directory is removed before returning,
    and also when one of the steps raises.
    """
    run_dir = tempfile.mkdtemp(prefix='reciprocal_blast_')
    try:
        # Create blast database, retrieve path & name
        db_dir, db_name = _create_blast_database(run_dir, good_proteins_fasta)

        # Blast individual fasta files against the made blast databank, instead of the much larger good_proteins_fasta
        x_vs_all_hits = [
            _blast_file_against_database(db_dir, db_name, fasta)
            for fasta in fasta_files
        ]

        # Concatenate the individual blast result files into one.
        # Leaving the with block closes the handle, so no open file descriptor is leaked; delete=False keeps
        # the created file in place for concatenate to fill
        with tempfile.NamedTemporaryFile(suffix='.tsv',
                                         prefix='all-vs-all_',
                                         delete=False) as handle:
            allvsall = handle.name
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up, regardless of whether the steps above succeeded
        shutil.rmtree(run_dir)

    return allvsall
Example #5
0
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files."""
    assert tuples_of_gbk_and_ptt_files is not None, 'No genbank files were provided'

    project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + project_id)
    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(
            out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir,
                                      '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    One blast run is submitted per file in fasta_files against the database created from good_proteins_fasta,
    after which the retrieved per-file hits are concatenated into a single temporary '.tsv' file.

    Returns the path of the concatenated hits file. The retrieved per-file hits and the remote database are
    cleaned up before returning, and also when one of the steps raises.
    """
    # Create blast database, retrieve its url
    database_url = _create_blast_database(good_proteins_fasta)

    # Filled one by one, so the cleanup below knows exactly which hits files were retrieved
    x_vs_all_hits = []
    try:
        # Submit job for each fasta file, and store jobid & hits file name tuples
        jobids_and_hits_filenames = [_submit_blast_run(database_url, fasta_file) for fasta_file in fasta_files]

        # Retrieve all results, which should only take negligibly longer than waiting for the slowest results
        for jobid, hits_file in jobids_and_hits_filenames:
            x_vs_all_hits.append(_retrieve_blast_hits(jobid, hits_file))

        # Concatenate the individual blast result files into one.
        # mkstemp returns an open file descriptor next to the path: close it, as only the path is used
        handle, allvsall = tempfile.mkstemp(suffix=".tsv", prefix="all_vs_all")
        os.close(handle)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up local and remote, regardless of whether the steps above succeeded
        for x_vs_all in x_vs_all_hits:
            os.remove(x_vs_all)
        # Remote database_url port :444 requires certificate authentication, which we remove to use the basic authentication
        send_request(database_url.replace(":444", ""), method="DELETE")

    return allvsall
Example #7
0
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a and genome_ids_b hold the genome identifiers of the two clades; sequences are assigned to a clade
    by the part of their id before the first '|'. sico_files are the paths of fasta alignments, each identified
    by its file name up to the first '.'.

    Returns a tuple of the tables for A and B as returned by _tables_for_split_alignments. When oddeven is true,
    the tables for the full, the odd codon and the even codon alignments are concatenated per clade into new
    temporary '.tsv' files, and the paths of those two files are returned instead.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file)
                  for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict(
        (ortholog, get_most_recent_gene_name(genomes, alignmnt))
        for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [
        (ortholog,
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_a),
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_b))
        for ortholog, alignmnt in sico_alignments
    ]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments,
                                                    ortholog_gene_names,
                                                    orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [
        (orthologname, _every_other_codon_alignments(alignment_x),
         _every_other_codon_alignments(alignment_y))
        for orthologname, alignment_x, alignment_y in split_alignments
    ]

    #Odd alignments are the first from each pair of alignments, even alignments the second
    table_a_odd, table_b_odd = _tables_for_codon_subset(
        odd_even_split_orth_alignments, 0, 'odd', ortholog_gene_names)
    table_a_even, table_b_even = _tables_for_codon_subset(
        odd_even_split_orth_alignments, 1, 'even', ortholog_gene_names)

    #Concatenate tables and return their values
    #mkstemp returns an open file descriptor next to the path: close it, as only the path is used
    handle_a, table_a_full = tempfile.mkstemp(suffix='.tsv',
                                              prefix='table_a_full_')
    os.close(handle_a)
    handle_b, table_b_full = tempfile.mkstemp(suffix='.tsv',
                                              prefix='table_b_full_')
    os.close(handle_b)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full


def _tables_for_codon_subset(odd_even_split_orth_alignments, index, label,
                             ortholog_gene_names):
    """Calculate the tables for A and B using only the odd (index 0) or only the even (index 1) codon alignments.

    label is either 'odd' or 'even', and is used in the temporary directory name and in the log message.
    """
    #Recover the requested alignments from each pair of odd & even alignments
    subset_split_alignments = [
        (orthologname, odd_even_x[index], odd_even_y[index])
        for orthologname, odd_even_x, odd_even_y in
        odd_even_split_orth_alignments
    ]

    #Create files for all these codon alignments, so we can run PhiPack for them
    alignments_dir = tempfile.mkdtemp(prefix=label + '_codon_alignments_')
    try:
        subset_files = dict(
            (ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
            for ortholog, subset_x, subset_y in subset_split_alignments)
        for ortholog, subset_x, subset_y in subset_split_alignments:
            AlignIO.write([subset_x, subset_y], subset_files[ortholog],
                          'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())
    finally:
        #Remove the temporary alignment files, also when writing them or running PhiPack failed
        shutil.rmtree(alignments_dir)

    #Calculate tables for these codon sico alignments
    log.info('Starting calculations for ' + label + ' alignments')
    return _tables_for_split_alignments(subset_split_alignments,
                                        ortholog_gene_names, phipack_vals)
Example #8
0
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a and genome_ids_b hold the genome identifiers of the two clades; sequences are assigned to a clade
    by the part of their id before the first '|'. sico_files are the paths of fasta alignments, each identified
    by its file name up to the first '.'.

    Returns a tuple of the tables for A and B as returned by _tables_for_split_alignments. When oddeven is true,
    the tables for the full, the odd codon and the even codon alignments are concatenated per clade into new
    temporary '.tsv' files, and the paths of those two files are returned instead.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file) for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict((ortholog, get_most_recent_gene_name(genomes, alignmnt))
                               for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [(ortholog,
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_a),
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_b))
                        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [(orthologname,
                                       _every_other_codon_alignments(alignment_x),
                                       _every_other_codon_alignments(alignment_y))
                                      for orthologname, alignment_x, alignment_y in split_alignments]

    #Odd alignments are the first from each pair of alignments, even alignments the second
    table_a_odd, table_b_odd = _tables_for_codon_subset(odd_even_split_orth_alignments, 0, 'odd', ortholog_gene_names)
    table_a_even, table_b_even = _tables_for_codon_subset(odd_even_split_orth_alignments, 1, 'even', ortholog_gene_names)

    #Concatenate tables and return their values
    #mkstemp returns an open file descriptor next to the path: close it, as only the path is used
    handle_a, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(handle_a)
    handle_b, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(handle_b)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full


def _tables_for_codon_subset(odd_even_split_orth_alignments, index, label, ortholog_gene_names):
    """Calculate the tables for A and B using only the odd (index 0) or only the even (index 1) codon alignments.

    label is either 'odd' or 'even', and is used in the temporary directory name and in the log message.
    """
    #Recover the requested alignments from each pair of odd & even alignments
    subset_split_alignments = [(orthologname,
                                odd_even_x[index],
                                odd_even_y[index])
                               for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]

    #Create files for all these codon alignments, so we can run PhiPack for them
    alignments_dir = tempfile.mkdtemp(prefix=label + '_codon_alignments_')
    try:
        subset_files = dict((ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
                            for ortholog, subset_x, subset_y in subset_split_alignments)
        for ortholog, subset_x, subset_y in subset_split_alignments:
            AlignIO.write([subset_x, subset_y], subset_files[ortholog], 'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())
    finally:
        #Remove the temporary alignment files, also when writing them or running PhiPack failed
        shutil.rmtree(alignments_dir)

    #Calculate tables for these codon sico alignments
    log.info('Starting calculations for ' + label + ' alignments')
    return _tables_for_split_alignments(subset_split_alignments, ortholog_gene_names, phipack_vals)