Beispiel #1
0
def indexSequenceBowtie2(referenceFile, threads):
    """
    Make sure a Bowtie2 index exists for a reference file, building it when missing

    The index is considered present when its first file is found next to the reference, either as a small index
    (".1.bt2") or as a large index (".1.bt2l"). Otherwise bowtie2-build is run using the reference path itself
    as the index basename.

    Parameters
    ----------
    referenceFile : str
        Path to the reference sequences file (also used as the index basename)
    threads : int
        Number of threads given to bowtie2-build

    Returns
    -------
    run_successfully : bool
        True when the index was already present, otherwise the success value returned by
        utils.runCommandPopenCommunicate for the bowtie2-build call
    """
    # bowtie2-build names the files of a large index ".bt2l" instead of ".bt2"; accept both so that an existing
    # large index is not rebuilt on every call
    if any(os.path.isfile(referenceFile + suffix) for suffix in ('.1.bt2', '.1.bt2l')):
        return True

    command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile]
    run_successfully, _, _ = utils.runCommandPopenCommunicate(command, False, None, True)
    return run_successfully
Beispiel #2
0
def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
    """
    Align reads against a reference with Bowtie2, indexing the reference first

    Parameters
    ----------
    fastq_files : list
        One (single-end) or two (paired-end) fastq file paths; any other number aborts the run
    referenceFile : str
        Path to the reference sequences file, also used as the Bowtie2 index basename
    threads : int
        Number of threads given to Bowtie2
    outdir : str
        Directory where "alignment.sam" is written
    conserved_True : bool
        If true "--sensitive" is used, otherwise "--very-sensitive-local"
    numMapLoc : int
        Value given to the Bowtie2 "-k" option

    Returns
    -------
    run_successfully : bool
        Tells if indexing and mapping succeeded
    sam_file : str or None
        Path to the SAM file, or None when the run did not succeed
    """
    alignment_path = os.path.join(outdir, str('alignment.sam'))

    succeeded = indexSequenceBowtie2(referenceFile, threads)
    if succeeded:
        # The reads option is kept as a single list element, exactly as the command has always been assembled
        number_files = len(fastq_files)
        if number_files == 1:
            reads_option = '-U ' + fastq_files[0]
        elif number_files == 2:
            reads_option = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
        else:
            return False, None

        preset = '--sensitive' if conserved_True else '--very-sensitive-local'

        bowtie_call = [
            'bowtie2', '-k', str(numMapLoc), '-q', preset,
            '--threads', str(threads),
            '-x', referenceFile, reads_option,
            '--no-unal', '-S', alignment_path,
        ]
        succeeded, _, _ = utils.runCommandPopenCommunicate(bowtie_call, False, None, True)

    return succeeded, (alignment_path if succeeded else None)
Beispiel #3
0
def include_rematch_dependencies_path():
    """
    Find ReMatCh and register its location through utils.setPATHvariable

    The location given by resource_filename('ReMatCh', 'rematch.py') is preferred. When that lookup raises
    ModuleNotFoundError, the first line printed by "which rematch.py" is used instead. The program exits when
    neither is available.

    Returns
    -------
    resource_rematch : str
        Location of rematch.py that was handed to utils.setPATHvariable
    """
    found_in_path, which_output, _ = utils.runCommandPopenCommunicate(
        ['which', 'rematch.py'], False, None, False)
    original_rematch = which_output.splitlines()[0] if found_in_path else None

    try:
        resource_rematch = resource_filename('ReMatCh', 'rematch.py')
    except ModuleNotFoundError:
        # No ReMatCh package available: fall back to whatever "which" reported (possibly None)
        resource_rematch = original_rematch
    else:
        message = '\nUsing ReMatCh "{resource_rematch}" via "{original_rematch}"\n'
        print(message.format(resource_rematch=resource_rematch,
                             original_rematch=original_rematch))

    if resource_rematch is None:
        sys.exit('ReMatCh not found in the PATH')

    utils.setPATHvariable(False, resource_rematch)
    return resource_rematch
Beispiel #4
0
def run_bowtie_inspect(index_without_sufix, outdir):
    """
    Create the reference fasta file from Bowtie2 index files

    Parameters
    ----------
    index_without_sufix : str
        Path to the basename of the index. The basename is name of any of the index files but with the .X.bt2 or
        .rev.X.bt2 suffix omitted.
    outdir : str
        Path to output directory where the fasta file will be stored

    Returns
    -------
    run_successfully : bool
        Tells if the reference fasta file was successfully created
    reference_file : str
        Path to the newly created reference fasta file
    """

    reference_file = os.path.join(outdir,
                                  os.path.basename(index_without_sufix))
    # Every element of the command must be a string: an integer can neither be joined into a command line nor
    # handed to subprocess, so the line width is given as '80'
    command = [
        'bowtie2-inspect', '--across', '80', index_without_sufix, '>',
        reference_file
    ]
    # NOTE(review): the second argument is presumably the shell flag, required here for the ">" redirection
    run_successfully, _, _ = utils.runCommandPopenCommunicate(
        command, True, None, True)
    return run_successfully, reference_file
def include_rematch_dependencies_path():
    """
    Locate rematch.py with "which" and register it through utils.setPATHvariable

    Returns
    -------
    rematch : str
        First line printed by "which rematch.py"

    Raises
    ------
    SystemExit
        If "which rematch.py" does not succeed
    """
    import sys

    command = ['which', 'rematch.py']
    run_successfully, stdout, _ = utils.runCommandPopenCommunicate(
        command, False, None, False)
    if not run_successfully:
        # Stop with a clear message instead of reaching the return statement with "rematch" never assigned
        sys.exit('ReMatCh not found in the PATH')

    rematch = stdout.splitlines()[0]
    utils.setPATHvariable(False, rematch)
    return rematch
Beispiel #6
0
def sortAlignment(alignment_file, output_file, sortByName_True, threads):
    """
    Sort an alignment file with "samtools sort"

    The output format given to samtools is the lower-cased extension of output_file (without the dot).

    Parameters
    ----------
    alignment_file : str
        Path to the alignment file to sort
    output_file : str
        Path to the sorted file to produce
    sortByName_True : bool
        If true the "-n" option is added to the command
    threads : int
        Value given to the samtools "-@" option

    Returns
    -------
    run_successfully : bool
        Success value returned by utils.runCommandPopenCommunicate
    output_file : str or None
        Path to the sorted file, or None when the run did not succeed
    """
    extension = os.path.splitext(output_file)[1]
    output_format = extension[1:].lower()
    # An empty placeholder is kept in the command when no name sorting is requested
    name_sort_option = '-n' if sortByName_True else ''

    samtools_call = [
        'samtools', 'sort',
        '-o', output_file,
        '-O', output_format,
        name_sort_option,
        '-@', str(threads),
        alignment_file,
    ]
    succeeded, _, _ = utils.runCommandPopenCommunicate(samtools_call, False, None, True)
    return succeeded, (output_file if succeeded else None)
def split_bam(bam_file, list_sequences, outdir, threads):
    """
    Extract the given sequences from a BAM file into "partial.bam" using "samtools view"

    Parameters
    ----------
    bam_file : str
        Path to the input BAM file
    list_sequences : list
        Names that are space-joined and appended as the last element of the command
    outdir : str
        Directory where "partial.bam" is written
    threads : int
        Value given to the samtools "-@" option

    Returns
    -------
    run_successfully : bool
        Success value returned by utils.runCommandPopenCommunicate
    new_bam : str
        Path to "partial.bam" (returned whether or not the run succeeded)
    """
    new_bam = os.path.join(outdir, 'partial.bam')

    samtools_call = ['samtools', 'view', '-b', '-u', '-h']
    samtools_call += ['-o', new_bam]
    samtools_call += ['-@', str(threads)]
    samtools_call += [bam_file, ' '.join(list_sequences)]

    succeeded, _, _ = utils.runCommandPopenCommunicate(samtools_call, False, None, True)
    return succeeded, new_bam
Beispiel #8
0
def include_rematch_dependencies_path(doNotUseProvidedSoftware):
    """
    Put the bcftools folder found next to rematch.py at the front of the PATH

    Nothing is changed when "which rematch.py" does not succeed or when doNotUseProvidedSoftware is true.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        If true, the PATH is left untouched
    """
    found, which_output, _ = utils.runCommandPopenCommunicate(['which', 'rematch.py'], False, None, False)
    if not found:
        return

    rematch_script = which_output.splitlines()[0]
    current_path = os.environ['PATH']
    install_dir = os.path.dirname(rematch_script)
    if doNotUseProvidedSoftware:
        return

    # <rematch folder>/src/bcftools-1.3.1/bin takes precedence over anything already in the PATH
    bundled_bcftools = os.path.join(install_dir, 'src', 'bcftools-1.3.1', 'bin')
    os.environ['PATH'] = str(':'.join([bundled_bcftools, current_path]))
Beispiel #9
0
def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
    """
    Run "samtools faidx" on a fasta file, optionally for a region and optionally redirected to a file

    Parameters
    ----------
    fasta : str
        Path to the fasta file
    region_None : str or None
        Region given to samtools faidx; None for no region
    region_outfile_none : str or None
        File receiving the output through a ">" redirection; None for no redirection
    print_comand_True : bool
        Passed as the last argument of utils.runCommandPopenCommunicate

    Returns
    -------
    run_successfully : bool
        Success value returned by utils.runCommandPopenCommunicate
    stdout : str
        Standard output value returned by utils.runCommandPopenCommunicate
    """
    # Unused slots stay in the command as empty strings
    region = '' if region_None is None else region_None
    if region_outfile_none is None:
        redirection = ['', '']
        use_shell = False
    else:
        # The redirection is the only case where the shell flag is switched on
        redirection = ['>', region_outfile_none]
        use_shell = True

    samtools_call = ['samtools', 'faidx', fasta, region] + redirection
    succeeded, stdout, _ = utils.runCommandPopenCommunicate(samtools_call, use_shell, None, print_comand_True)
    return succeeded, stdout
Beispiel #10
0
def include_rematch_dependencies_path(doNotUseProvidedSoftware):
    """
    Prepend the bcftools folder located beside rematch.py to the PATH environment variable

    The PATH is only modified when "which rematch.py" succeeds and doNotUseProvidedSoftware is false.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        If true, the PATH is not modified
    """
    succeeded, which_stdout, _ = utils.runCommandPopenCommunicate(
        ['which', 'rematch.py'], False, None, False)
    if succeeded:
        rematch_location = which_stdout.splitlines()[0]
        previous_path = os.environ['PATH']
        rematch_dir = os.path.dirname(rematch_location)
        use_bundled = not doNotUseProvidedSoftware
        if use_bundled:
            # bcftools is looked for under <rematch folder>/src/bcftools-1.3.1/bin
            bundled_bin = os.path.join(
                rematch_dir, 'src', 'bcftools-1.3.1', 'bin')
            os.environ['PATH'] = str(':'.join((bundled_bin, previous_path)))
Beispiel #11
0
def run_blast_command(query_file, blast_db, db_type, blast_output, threads=1):
    """
    Run Blast: blastn or blastp

    Parameters
    ----------
    query_file : str
        Path to fasta file containing the query sequences, e.g. /input/queries.fasta
    blast_db : str
        Path to Blast DB files, e.g. /input/blast_db.nucl.sequences.fasta
    db_type : str
        Blast DB type. Can only be 'nucl' or 'prot'
    blast_output : str
        Path to Blast output tabular file
    threads : int
        Number of CPUs to use during Blast

    Returns
    -------
    run_successfully : bool
        Tells if Blast ran successfully

    Raises
    ------
    SystemExit
        If db_type is neither 'nucl' nor 'prot'
    """
    import sys

    # Check Blast DB type
    if db_type not in ('nucl', 'prot'):
        sys.exit('Wrong Blast DB type provided ({db_type}).'
                 ' Use one of the following: "nucl", "prot"'.format(
                     db_type=db_type))

    # Make sure the output directory exists. For a bare file name the directory part is empty (current
    # directory) and there is nothing to create; os.makedirs('') would raise.
    output_dir = os.path.dirname(blast_output)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    program = 'blastn' if db_type == 'nucl' else 'blastp'

    # The embedded single quotes of the -outfmt value are kept exactly as they were.
    # NOTE(review): presumably utils.runCommandPopenCommunicate re-parses the command line so that the quoted
    # format travels as one argument - confirm before changing it.
    command = [
        program, '-task', program, '-query', query_file, '-db', blast_db,
        '-out', blast_output, '-outfmt',
        "'7 qseqid qlen sseqid slen qstart qend sstart send evalue length pident nident mismatch gaps'"
    ]

    # DUST low-complexity filtering is a nucleotide-only option; blastp does not accept -dust
    if db_type == 'nucl':
        command += ['-dust', 'no']

    command += ['-culling_limit', '1', '-num_threads', str(threads)]

    run_successfully, _, _ = utils.runCommandPopenCommunicate(
        command, False, None, True)

    return run_successfully
Beispiel #12
0
def run_bowtie_build(reference_file, outdir, threads=1):
    """
    Build Bowtie2 index files for a reference

    The index basename is the reference file name placed inside outdir.

    Parameters
    ----------
    reference_file : str
        Path to the reference sequences file
    outdir : str
        Path to the output directory used for the index basename
    threads : int, default 1
        Number of CPUs to use while creating Bowtie2 index

    Returns
    -------
    run_successfully : bool
        Success value returned by utils.runCommandPopenCommunicate for the bowtie2-build call
    """
    reference_name = os.path.basename(reference_file)
    index_basename = os.path.join(outdir, reference_name)

    build_call = ['bowtie2-build', '--threads', str(threads)]
    build_call += [reference_file, index_basename]

    succeeded, _, _ = utils.runCommandPopenCommunicate(build_call, False, None, True)
    return succeeded
Beispiel #13
0
def indexAlignment(alignment_file):
    """
    Index an alignment file with "samtools index"

    Parameters
    ----------
    alignment_file : str
        Path to the alignment file to index

    Returns
    -------
    run_successfully : bool
        Success value returned by utils.runCommandPopenCommunicate
    """
    succeeded, _, _ = utils.runCommandPopenCommunicate(
        ['samtools', 'index', alignment_file], False, None, True)
    return succeeded
Beispiel #14
0
def create_blast_db(db_sequences, db_output, db_type):
    """
    Creates a Blast DB

    An existing DB is reused when both the DB and the copy of its original fasta file are reported by
    check_db_exists. When only one of the two is found nothing is built and False is returned. Otherwise
    makeblastdb is run and, on success, db_sequences is copied to db_output.

    Parameters
    ----------
    db_sequences : str
        Path to fasta file containing the sequences from which Blast DB will be created, e.g. /input/sequences.fasta
    db_output : str
        Path to Blast output DB files, e.g. /output/blast_db.nucl.sequences.fasta
    db_type : str
        Blast DB type. Can only be 'nucl' or 'prot'

    Returns
    -------
    run_successfully : bool
        Tells if the Blast DB was successfully created

    Raises
    ------
    SystemExit
        If db_type is neither 'nucl' nor 'prot'
    """
    import sys
    from shutil import copyfile

    # Check Blast DB type
    if db_type not in ('nucl', 'prot'):
        sys.exit(
            'Wrong Blast DB type provided ({db_type}). Use one of the following: nucl, prot'
            .format(db_type=db_type))

    # An empty directory part means the current directory: nothing to create (os.makedirs('') would raise)
    db_dir = os.path.dirname(db_output)

    if db_dir and not os.path.isdir(db_dir):
        # A directory that had to be created cannot hold a DB yet, so the DB is built right away instead of
        # returning without having produced anything
        os.makedirs(db_dir)
        db_exists, original_file = False, False
    else:
        db_exists, original_file = check_db_exists(db_output)

    if db_exists and original_file:
        print('Blast DB already found at {db_output}'
              ' for {db_sequences}'.format(db_output=db_dir,
                                           db_sequences=db_sequences))
        return True

    if db_exists:
        print(
            'The original fasta file ({db_sequences}) from which the Blast DB was produced is not present. Make'
            ' sure it is found in {db_dir} and it is'
            ' named {original_file_name}'.format(
                db_dir=db_dir,
                db_sequences=db_sequences,
                original_file_name=os.path.basename(db_output)))
        return False

    if original_file:
        print(
            'The original fasta file ({db_sequences}) from which the Blast DB was supposed to be produced is'
            ' present in Blast DB directory ({db_dir}), but no Blast DB files were found'
            ' there.'.format(db_sequences=db_sequences, db_dir=db_dir))
        return False

    run_successfully, _, _ = utils.runCommandPopenCommunicate([
        'makeblastdb', '-parse_seqids', '-dbtype', db_type, '-in',
        db_sequences, '-out', db_output
    ], False, None, True)
    if run_successfully:
        # Keep a copy of the source fasta under the DB name; the message above asks for a file with that name
        # to be present next to the DB
        copyfile(db_sequences, db_output)

    return run_successfully
Beispiel #15
0
def main():
    """
    Build STX subtyping fasta files from the virulencefinder_db repository

    The repository is cloned into <outdir>/virulence_db/ and virulence_ecoli.fsa is read from it. Sequences
    whose ID starts with "stx" (case-insensitive) and has four ":"-separated fields are kept, tagged with their
    subtype and written to one fasta file per gene (first four characters of the sequence ID). Sequences for
    which extend_ambiguous_dna returns None are listed, one per line, in a "problematic_sequences" tab file.
    """
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 get_stx_db.py"')

    parser = argparse.ArgumentParser(
        prog='get_stx_db.py',
        description=
        'Gets STX sequences from virulencefinder_db to produce a STX subtyping'
        ' DB',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version',
                        help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_optional_general = parser.add_argument_group(
        'General facultative options')
    parser_optional_general.add_argument(
        '-o',
        '--outdir',
        type=str,
        metavar='/path/to/output/directory/',
        help='Path to the directory where the sequences will be stored',
        required=False,
        default='.')

    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Get virulencefinder_db
    # NOTE(review): the outcome of the clone is not checked here; confirm whether a failed clone should stop
    # the run
    url = 'https://bitbucket.org/genomicepidemiology/virulencefinder_db.git'
    virulencefinder_db = os.path.join(args.outdir, 'virulence_db', '')
    _, _, _ = utils.runCommandPopenCommunicate(
        ['git', 'clone', url, virulencefinder_db], False, None, True)
    # The output of "git log" (abbreviated hash of the last commit) is embedded in every output file name
    _, commit, _ = utils.runCommandPopenCommunicate([
        'git', '-C', virulencefinder_db, 'log', '--pretty=format:"%h"', '-n',
        '1'
    ], True, 15, True)

    # Get STX sequences
    stx_seq = {}
    allowed_chars = set(Seq.IUPAC.IUPACData.unambiguous_dna_letters)
    with open(os.path.join(
            args.outdir,
            'virulence_db.virulence_ecoli.commit_{commit}.problematic_sequences.tab'
            .format(commit=commit)),
              'wt',
              newline='\n') as writer:
        for seq in SeqIO.parse(
                os.path.join(virulencefinder_db, 'virulence_ecoli.fsa'),
                'fasta'):
            if not seq.id.lower().startswith('stx'):
                continue

            subtype = seq.id.split(':')
            if len(subtype) != 4:
                continue

            # The gene entry is created even if the sequence ends up being skipped below
            stx_seq.setdefault(seq.id[:4], [])

            # Note from Jani:
            # After spending what seemed to be an endless amount of hours trying to solve the STEC stx subtype
            # mystery I've come to the following conclusion. For the platform we need to combine in the target
            # db stx2a, stx2c and stx2d as one subtype called stx2acd. This is due to the fact that all of these
            # subtypes are the most potent ones to cause HUS and cannot be separated from each other by the
            # methods in use right now.
            if subtype[0][:4] == 'stx2' and subtype[3] in ['a', 'c', 'd']:
                subtype[3] = 'acd'

            subtype = subtype[0][:4] + subtype[3]  # Define subtype
            seq.description = ''  # To avoid description to be print in outfile

            # For sequences with IUPAC codes, use one possible sequence based on the one with the codes
            if not set(seq.seq.upper()).issubset(allowed_chars):
                all_possible_sequences = extend_ambiguous_dna(
                    seq.seq.upper())
                if all_possible_sequences is None:
                    # One record per line: without the trailing newline consecutive records run together
                    writer.write('\t'.join([
                        seq.id, 'Memory Error (too much IUPAC codes)'
                    ]) + '\n')
                    continue

                seq = SeqRecord(
                    Seq.Seq(all_possible_sequences[0], generic_dna),
                    id='{seq_name}:IUPAC_codes_removed'.format(
                        seq_name=seq.id),
                    description='')  # Change the sequence

            seq.id = '{seq_name}:seqTyping_{subtype}'.format(
                seq_name=seq.id, subtype=subtype)
            stx_seq[seq.id[:4]].append(seq)

    # Write files
    for gene, seqs in stx_seq.items():
        with open(os.path.join(
                args.outdir,
                'virulence_db.virulence_ecoli.commit_{commit}.{gene}_subtyping.seq_typing.fasta'
                .format(commit=commit, gene=gene)),
                  'wt',
                  newline='\n') as writer:
            _ = SeqIO.write(seqs, writer, "fasta")

    _ = utils.runTime(start_time)