コード例 #1
0
def resfinder_redmine(redmine_instance, issue, work_dir, description):
    """
    Run GeneSeekr (blastn) against the ResFinder database for the SEQIDs requested in a Redmine issue, attach the
    resulting report to the issue, and clean up the working directory.

    :param redmine_instance: Path to a pickle file containing the Redmine API object
    :param issue: Path to a pickle file containing the Redmine issue object
    :param work_dir: Directory in which the FASTA files and the 'reports' folder are created
    :param description: Path to a pickle file containing the issue description as an iterable of strings
        (each entry is treated as a SEQID)
    """
    # Unpickle Redmine objects. Context managers guarantee that the pickle files are closed again.
    # NOTE(review): pickle executes arbitrary code on load - these files are assumed to come from a trusted source.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = [item.upper() for item in description]

        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype='fasta',
                           copyflag=False)

        missing_fastas = verify_fasta_files_present(seqids, work_dir)
        if missing_fastas:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: Could not find the following requested SEQIDs on'
                                                ' the OLC NAS: {}'.format(missing_fastas))

        # Run ResFinder
        report_dir = os.path.join(work_dir, 'reports')
        cmd = 'GeneSeekr blastn -s {seqfolder} -t {targetfolder} -r {reportdir} -A'\
            .format(seqfolder=work_dir,
                    targetfolder=os.path.join(COWBAT_DATABASES, 'resfinder'),
                    reportdir=report_dir)
        print(cmd)
        os.system(cmd)
        # Get the output file uploaded.
        output_list = [{
            'path': os.path.join(report_dir, 'resfinder_blastn.xlsx'),
            'filename': 'resfinder_blastn.xlsx'
        }]
        redmine_instance.issue.update(resource_id=issue.id, uploads=output_list, status_id=4,
                                      notes='resfinder process complete!')

        # Clean up all FASTA files so we don't take up too much space on the NAS. This is best effort: the
        # results have already been uploaded, so a failed clean-up must not be reported as a failed analysis.
        try:
            for name in os.listdir(work_dir):
                # Mirror the shell pattern '*fasta' (which never matches hidden files)
                if not name.endswith('fasta') or name.startswith('.'):
                    continue
                fasta_path = os.path.join(work_dir, name)
                # Only remove files and symlinks; never recurse into directories
                if os.path.isfile(fasta_path) or os.path.islink(fasta_path):
                    os.remove(fasta_path)
        except OSError:
            pass
        try:
            shutil.rmtree(report_dir)
        except OSError:
            pass
    except Exception as e:
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! Send this error traceback to your friendly '
                                            'neighborhood bioinformatician: {}'.format(e))
コード例 #2
0
def pointfinder_redmine(redmine_instance, issue, work_dir, description):
    """
    Run PointFinder on the assemblies requested in a Redmine issue.

    The genus of every assembly is first determined with a MASH screen; assemblies whose genus is supported by
    PointFinder are then processed one at a time, the outputs are summarised (for genera with defined summary
    headers), zipped, and attached to the Redmine issue together with a note listing any sequences that could not
    be processed.

    :param redmine_instance: Path to a pickle file containing the Redmine API object
    :param issue: Path to a pickle file containing the Redmine issue object
    :param work_dir: Working directory in which all intermediate and output files are created
    :param description: Path to a pickle file containing the issue description as a sequence of strings
        (each non-alphabetic entry is treated as a SEQID)
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects. Context managers guarantee that the pickle files are closed again.
    # NOTE(review): pickle executes arbitrary code on load - these files are assumed to come from a trusted source.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)
    # Parse description to get list of SeqIDs. Minimal check to make sure IDs provided somewhat resemble a valid
    # sample ID: purely alphabetic entries are discarded
    seqids = [item for item in (entry.upper() for entry in description) if not item.isalpha()]

    # Record the requested SeqIDs in the working directory
    with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
        for seqid in seqids:
            f.write(seqid + '\n')
    # Drop FASTA files into workdir
    retrieve_nas_files(seqids=seqids,
                       outdir=work_dir,
                       filetype='fasta',
                       copyflag=False)
    # Create output directory
    output_dir = os.path.join(work_dir, 'output')
    make_path(output_dir)
    # Get all of the FASTA files
    fasta_list = sorted(glob.glob(os.path.join(work_dir, '*.fasta')))
    # Set the folder to store all the PointFinder outputs
    pointfinder_output_dir = os.path.join(work_dir, 'pointfinder_outputs')
    # Initialise dictionaries to store the mash-calculated, and pointfinder-formatted genus outputs for each strain
    genus_dict = dict()
    organism_dict = dict()
    # Create lists to store missing and unprocessed seqids
    unprocessed_seqs = list()
    missing_seqs = list()
    mash_fails = list()
    # Dictionary to convert the mash-calculated genus to the pointfinder format.
    # NOTE: the 'Mycobacterium' key previously started with an invisible U+200E character, so it could never match
    pointfinder_org_dict = {
        'Campylobacter': 'campylobacter',
        'Escherichia': 'e.coli',
        'Shigella': 'e.coli',
        'Mycobacterium': 'tuberculosis',
        'Neisseria': 'gonorrhoeae',
        'Salmonella': 'salmonella'
    }
    # Reverse look-up dictionary
    rev_org_dict = {
        'campylobacter': 'Campylobacter',
        'e.coli': 'Escherichia',
        'tuberculosis': 'Mycobacterium',
        'gonorrhoeae': 'Neisseria',
        'salmonella': 'Salmonella'
    }
    # The results summary is shared by all genera; the prediction and table summaries are genus-specific
    results_header = 'Strain,Genus,Mutation,NucleotideChange,AminoAcidChange,Resistance,PMID,\n'
    report_headers = {
        'Salmonella': {
            'prediction':
            'Strain,Colitsin,Colistin,Spectinomycin,Quinolones,\n',
            'table':
            'Strain,parE,parC,gyrA,pmrB,pmrA,gyrB,16S_rrsD,23S,\n'
        },
        'Escherichia': {
            'prediction':
            'Strain,Colistin,GentamicinC,gentamicinC,Streptomycin,Macrolide,Sulfonamide,'
            'Tobramycin,Neomycin,Fluoroquinolones,Aminocoumarin,Tetracycline,KanamycinA,'
            'Spectinomycin,B-lactamResistance,Paromomycin,Kasugamicin,Quinolones,G418,'
            'QuinolonesAndfluoroquinolones,\n',
            'table':
            'Strain,parE,parC,folP,gyrA,pmrB,pmrA,16S_rrsB,16S_rrsH,gyrB,ampC,16S_rrsC,23S,\n'
        },
        'Campylobacter': {
            'prediction':
            'Strain,LowLevelIncreaseMIC,AssociatedWithT86Mutations,Macrolide,Quinolone,'
            'Streptinomycin,Erythromycin,IntermediateResistance,HighLevelResistance_'
            'nalidixic_and_ciprofloxacin,\n',
            'table':
            'Strain,L22,rpsL,cmeR,gyrA,23S,\n'
        }
    }
    # For every genus and report type: the header of the summary, the path of the most recent PointFinder output
    # (filled in per strain below), and the path of the summary file
    summary_dict = dict()
    for summary_genus, headers in report_headers.items():
        summary_dict[summary_genus] = {
            'prediction': {
                'header': headers['prediction'],
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir,
                                        '{genus}_prediction_summary.csv'.format(genus=summary_genus))
            },
            'table': {
                'header': headers['table'],
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir,
                                        '{genus}_table_summary.csv'.format(genus=summary_genus))
            },
            'results': {
                'header': results_header,
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir,
                                        'PointFinder_results_summary.csv')
            }
        }

    # Run mash screen on each of the assemblies
    for item in fasta_list:
        seqid = os.path.splitext(os.path.basename(item))[0]
        screen_file = os.path.join(output_dir,
                                   '{seqid}_screen.tab'.format(seqid=seqid))
        mash.screen('/mnt/nas2/databases/confindr/databases/refseq.msh',
                    item,
                    threads=8,
                    w='',
                    i='0.95',
                    output_file=screen_file,
                    returncmd=True)
        screen_output = mash.read_mash_screen(screen_file)
        # Determine the genus from the screen output file. If there are several hits, the last one wins.
        for screen in screen_output:
            # Extract the genus from the mash results
            mash_organism = screen.query_id.split('/')[-3]
            # Use the organism as a key in the pointfinder database name conversion dictionary
            mash_genus = pointfinder_org_dict.get(mash_organism, 'NA')
            # Populate the dictionaries with the seqid, and the calculated genus/pointfinder name
            genus_dict[seqid] = mash_genus
            organism_dict[seqid] = mash_organism
    # Delete all of the FASTA files
    for fasta in fasta_list:
        os.remove(fasta)

    # Pointfinder
    # These unfortunate hard coded paths appear to be necessary
    activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/pointfinder'
    pointfinder_py = '/mnt/nas2/virtual_environments/pointfinder/pointfinder-3.0/pointfinder-3.0.py'
    # Database locations
    pointfinder_db = '/mnt/nas2/databases/assemblydatabases/0.3.4/pointfinder'
    # List of organisms in the pointfinder database
    pointfinder_list = [
        'campylobacter', 'e.coli', 'tuberculosis', 'gonorrhoeae', 'salmonella'
    ]
    try:
        os.mkdir(pointfinder_output_dir)
    except FileExistsError:
        pass
    # Pointfinder cannot handle an entire folder of sequences; each sample must be processed independently
    for seqid in sorted(seqids):
        # If the seqid isn't present in the dictionary, it is because the assembly could not be found - or because
        # MASH screen failed. Only this look-up is guarded, so that unrelated KeyErrors are not misreported.
        try:
            pointfinder_genus = genus_dict[seqid]
        except KeyError:
            if not os.path.isfile(
                    os.path.join(output_dir,
                                 '{seq}_screen.tab'.format(seq=seqid))):
                missing_seqs.append(seqid)
            else:
                mash_fails.append(seqid)
            continue
        # If the genus isn't in the pointfinder database, do not attempt to process it. This check must precede the
        # reverse look-up: the 'NA' placeholder has no entry in rev_org_dict.
        if pointfinder_genus not in pointfinder_list:
            unprocessed_seqs.append(seqid)
            continue
        genus = rev_org_dict[pointfinder_genus]
        # Create folder to drop FASTA files
        assembly_folder = os.path.join(work_dir, seqid)
        make_path(assembly_folder)
        # Extract FASTA files.
        retrieve_nas_files(seqids=[seqid],
                           outdir=assembly_folder,
                           filetype='fasta',
                           copyflag=False)
        fasta = os.path.join(assembly_folder,
                             '{seqid}.fasta'.format(seqid=seqid))
        # Prepare command
        cmd = 'python {py} -i {fasta} -s {orgn} -p {db} -o {output} -m blastn -m_p {blast_path}'\
            .format(py=pointfinder_py,
                    fasta=fasta,
                    orgn=pointfinder_genus,
                    db=pointfinder_db,
                    output=pointfinder_output_dir,
                    blast_path='/mnt/nas2/virtual_environments/pointfinder/bin/blastn'
                    )
        # Create another shell script to execute within the pointfinder conda environment
        template = "#!/bin/bash\n{} && {}".format(activate, cmd)
        pointfinder_script = os.path.join(work_dir,
                                          'run_pointfinder.sh')
        with open(pointfinder_script, 'w+') as script_handle:
            script_handle.write(template)
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(pointfinder_script)
        # Run shell script
        os.system(pointfinder_script)
        # Summary headers are only defined for some genera. For the others, the raw PointFinder outputs are still
        # included in the zipped results, but no summary reports can be written.
        if genus not in summary_dict:
            continue
        # Find the pointfinder outputs
        summary_dict[genus]['prediction']['output'] = \
            glob.glob(os.path.join(pointfinder_output_dir, '{seq}*prediction.txt'.format(seq=seqid)))[0]
        summary_dict[genus]['table']['output'] = \
            glob.glob(os.path.join(pointfinder_output_dir, '{seq}*table.txt'.format(seq=seqid)))[0]
        summary_dict[genus]['results']['output'] = \
            glob.glob(os.path.join(pointfinder_output_dir, '{seq}*results.txt'.format(seq=seqid)))[0]
        # Process the predictions
        write_report(summary_dict=summary_dict,
                     seqid=seqid,
                     genus=genus,
                     key='prediction')
        # Process the results summary
        write_report(summary_dict=summary_dict,
                     seqid=seqid,
                     genus=genus,
                     key='results')
        # Process the table summary
        write_table_report(summary_dict=summary_dict,
                           seqid=seqid,
                           genus=genus)
    # Attempt to clear out the tmp folder from the pointfinder_output_dir
    try:
        shutil.rmtree(os.path.join(pointfinder_output_dir, 'tmp'))
    except FileNotFoundError:
        pass
    # Zip output
    output_filename = 'pointfinder_output'
    zip_filepath = zip_folder(results_path=pointfinder_output_dir,
                              output_dir=work_dir,
                              output_filename=output_filename)
    zip_filepath += '.zip'
    # Prepare upload
    output_list = [{
        'filename': os.path.basename(zip_filepath),
        'path': zip_filepath
    }]

    # Create a note to add to the updated Redmine issue
    notes = 'Pointfinder process complete!'
    # If there are missing, or unprocessed sequences, add details to the note
    if unprocessed_seqs:
        seq_list = [
            '{seqid} ({organism})'.format(seqid=sequence, organism=organism_dict[sequence])
            for sequence in unprocessed_seqs
        ]
        if len(unprocessed_seqs) > 1:
            notes += '\n The following sequences were not processed, as they were determined to be genera not ' \
                     'present in the pointfinder database: {seqs}'.format(seqs=', '.join(seq_list))
        else:
            notes += '\n The following sequence was not processed, as it was determined to be a genus not ' \
                     'present in the pointfinder database: {seqs}'.format(seqs=', '.join(seq_list))
    if missing_seqs:
        if len(missing_seqs) > 1:
            notes += '\n The following sequences were not processed, as they could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
        else:
            notes += '\n The following sequence was not processed, as it could not be located in the strain database:' \
                     ' {seqs}'.format(seqs=', '.join(missing_seqs))
    if mash_fails:
        if len(mash_fails) > 1:
            notes += '\n The following sequences could not be processed by MASH screen: {seqs}'\
                .format(seqs=', '.join(mash_fails))
        else:
            notes += '\n The following sequence could not be processed by MASH screen: {seqs}'\
                .format(seqs=', '.join(mash_fails))
    # Create a list of all the folders - will be used to clean up the working directory
    folders = glob.glob(os.path.join(work_dir, '*/'))
    # Remove all the folders
    for folder in folders:
        if os.path.isdir(folder):
            shutil.rmtree(folder)
    # Wrap up issue
    redmine_instance.issue.update(resource_id=issue.id,
                                  uploads=output_list,
                                  status_id=4,
                                  notes=notes)
コード例 #3
0
def prokka_redmine(redmine_instance, issue, work_dir, description):
    """
    Run Prokka on every assembly requested in a Redmine issue, zip the results, upload the archive to the FTP
    server, and update the issue with the outcome.

    :param redmine_instance: Path to a pickle file containing the Redmine API object
    :param issue: Path to a pickle file containing the Redmine issue object
    :param work_dir: Working directory in which the 'assemblies' and 'output' folders are created
    :param description: Path to a pickle file containing the issue description as a sequence of strings
        (each entry is treated as a SEQID)
    """
    # Unpickle Redmine objects. This is done before entering the try block, as the error handler below needs the
    # unpickled objects; otherwise a failure here would be masked by an AttributeError raised in the handler.
    # NOTE(review): pickle executes arbitrary code on load - these files are assumed to come from a trusted source.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        # Parse description to get list of SeqIDs (no validation is performed on the entries)
        seqids = [item.upper() for item in description]

        # Create folder to drop FASTA files
        assemblies_folder = os.path.join(work_dir, 'assemblies')
        os.mkdir(assemblies_folder)

        # Create output folder
        output_folder = os.path.join(work_dir, 'output')
        os.makedirs(output_folder)

        # Extract FASTA files.
        retrieve_nas_files(seqids=seqids,
                           outdir=assemblies_folder,
                           filetype='fasta',
                           copyflag=False)
        missing_fastas = verify_fasta_files_present(seqids, assemblies_folder)
        if missing_fastas:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'WARNING: Could not find the following requested SEQIDs on '
                'the OLC NAS: {}'.format(missing_fastas))

        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/prokka'
        prokka = '/mnt/nas2/virtual_environments/prokka/bin/prokka'
        # The same script file is rewritten and executed for every assembly
        prokka_script = os.path.join(work_dir, 'run_prokka.sh')

        for assembly in glob.glob(os.path.join(assemblies_folder, '*.fasta')):
            # Everything before the first '.' of the file name is used as the SeqID
            seqid = os.path.split(assembly)[1].split('.')[0]
            # Prepare command
            cmd = '{prokka} --outdir {output_folder} --prefix {seqid} {assembly}'.format(
                prokka=prokka,
                output_folder=os.path.join(output_folder, seqid),
                seqid=seqid,
                assembly=assembly)

            # Create another shell script to execute within the prokka conda environment
            template = "#!/bin/bash\n{} && {}".format(activate, cmd)
            with open(prokka_script, 'w+') as script_handle:
                script_handle.write(template)
            make_executable(prokka_script)

            # Run shell script
            os.system(prokka_script)

        # Zip output
        output_filename = 'prokka_output_{}'.format(issue.id)
        zip_filepath = zip_folder(results_path=output_folder,
                                  output_dir=work_dir,
                                  output_filename=output_filename)
        zip_filepath += '.zip'

        upload_successful = upload_to_ftp(local_file=zip_filepath)
        # Report the outcome of the upload
        if upload_successful:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes='Prokka process complete!\n\n'
                'Results are available at the following FTP address:\n'
                'ftp://ftp.agr.gc.ca/outgoing/cfia-ak/{}'.format(
                    os.path.split(zip_filepath)[1]))
        else:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes=
                'Upload of result files was unsuccessful due to FTP connectivity issues. '
                'Please try again later.')
        # Clean up files
        shutil.rmtree(output_folder)
        os.remove(zip_filepath)
    except Exception as e:
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #4
0
def ec_typer_redmine(redmine_instance, issue, work_dir, description):
    """
    Run ECTyper on the assemblies requested in a Redmine issue and attach the resulting report to the issue.

    :param redmine_instance: Path to a pickle file containing the Redmine API object
    :param issue: Path to a pickle file containing the Redmine issue object
    :param work_dir: Working directory in which the 'assemblies' and 'output' folders are created
    :param description: Path to a pickle file containing the issue description as a sequence of strings
        (each entry is treated as a SEQID)
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects. This is kept separate from the main try block, as its error handler needs the
    # unpickled objects; a failure here is logged and re-raised instead of being masked by an AttributeError.
    # NOTE(review): pickle executes arbitrary code on load - these files are assumed to come from a trusted source.
    try:
        with open(redmine_instance, 'rb') as handle:
            redmine_instance = pickle.load(handle)
        with open(issue, 'rb') as handle:
            issue = pickle.load(handle)
        with open(description, 'rb') as handle:
            description = pickle.load(handle)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        raise

    try:
        # Parse description to get list of SeqIDs (no validation is performed on the entries)
        seqids = [item.upper() for item in description]

        # Create folder to drop FASTA files
        assemblies_folder = os.path.join(work_dir, 'assemblies')
        os.mkdir(assemblies_folder)

        # Create output folder
        output_folder = os.path.join(work_dir, 'output')
        os.makedirs(output_folder)

        # Extract FASTA files.
        retrieve_nas_files(seqids=seqids,
                           outdir=assemblies_folder,
                           filetype='fasta',
                           copyflag=False)
        missing_fastas = verify_fasta_files_present(seqids, assemblies_folder)
        if missing_fastas:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'WARNING: Could not find the following requested SEQIDs on '
                'the OLC NAS: {}'.format(missing_fastas))

        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/cowbat'
        ectyper = '/mnt/nas2/virtual_environments/cowbat/bin/ectyper'

        # Prepare command
        cmd = '{ectyper} -i {input_folder} -o {output_folder}'.format(
            ectyper=ectyper,
            input_folder=assemblies_folder,
            output_folder=output_folder)

        # Create another shell script to execute within the cowbat conda environment
        template = "#!/bin/bash\n{} && {}".format(activate, cmd)
        ec_script = os.path.join(work_dir, 'run_ec_typer.sh')
        with open(ec_script, 'w+') as script_handle:
            script_handle.write(template)
        make_executable(ec_script)

        # Run shell script
        os.system(ec_script)

        # Get the output file uploaded: the 'output.tsv' report is attached as 'ec_typer_report.tsv'
        output_list = [{
            'path': os.path.join(output_folder, 'output.tsv'),
            'filename': 'ec_typer_report.tsv'
        }]
        redmine_instance.issue.update(resource_id=issue.id,
                                      uploads=output_list,
                                      status_id=4,
                                      notes='ECTyper analyses complete!')
        # Clean up files
        shutil.rmtree(output_folder)
        shutil.rmtree(assemblies_folder)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! We log this automatically and will look into the '
            'problem and get back to you with a fix soon.')
コード例 #5
0
def snvphyl_redmine(redmine_instance, issue, work_dir, description):
    """
    Run SNVPhyl for a Redmine issue and attach the zipped results to the issue.

    The description is expected to list exactly one reference (a SEQID, or 'ATTACHED' to use a file attached to
    the issue), followed by a line containing 'COMPARE' and the query SEQIDs. Lines containing 'REFERENCE' and
    empty lines are ignored.

    :param redmine_instance: Path to a pickle file containing the Redmine API object
    :param issue: Path to a pickle file containing the Redmine issue object
    :param work_dir: Working directory in which the reference, the 'fastqs' folder and the 'output' folder are placed
    :param description: Path to a pickle file containing the issue description as an iterable of strings
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects. Context managers guarantee that the pickle files are closed again.
    # NOTE(review): pickle executes arbitrary code on load - these files are assumed to come from a trusted source.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        query_list = list()
        reference = list()
        compare = False
        # Go through description to figure out what our query is and what the reference is.
        for item in description:
            item = item.upper()
            if item == '':
                continue
            if 'COMPARE' in item:
                # Everything after this marker is a query SEQID
                compare = True
                continue
            if compare:
                query_list.append(item)
            elif 'REFERENCE' not in item:
                reference.append(item)

        # Retrieve our reference file. Error user if they selected anything but one reference and don't continue.
        if len(reference) != 1:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='ERROR: You must specify one reference strain, and you '
                                                'specified {} reference strains. Please create a new'
                                                ' issue and try again.'.format(len(reference)), status_id=4)
            return

        # Entries were upper-cased above, so a direct comparison is sufficient
        if reference[0] != 'ATTACHED':
            # Extract our reference file to our working directory.
            retrieve_nas_files(seqids=reference,
                               outdir=work_dir,
                               filetype='fasta',
                               copyflag=True)
            # Check that the file was successfully extracted. If it wasn't boot the user.
            if not glob.glob(os.path.join(work_dir, '*fasta')):
                redmine_instance.issue.update(resource_id=issue.id,
                                              notes='ERROR: Could not find the specified reference file.'
                                                    ' Please verify it is a correct SEQID, create a new '
                                                    'issue, and try again.', status_id=4)
                return

        # If user specified attachment as the reference file, download it to our working directory.
        else:
            # Get the attachment ID, and download if it isn't equal to zero (meaning no attachment, so boot user with
            # appropriate error message). If there are several attachments, the last one listed is used.
            attachment = redmine_instance.issue.get(issue.id, include='attachments')
            attachment_id = 0
            for item in attachment.attachments:
                attachment_id = item.id

            # Download if we found an attachment, and use as our reference. Otherwise, exit and tell user to try again
            if attachment_id != 0:
                attachment = redmine_instance.attachment.get(attachment_id)
                attachment.download(savepath=work_dir, filename='reference.fasta')
            else:
                redmine_instance.issue.update(resource_id=issue.id,
                                              notes='ERROR: You specified that the reference would be in attached file,'
                                                    ' but no attached file was found. Please create a new issue and '
                                                    'try again.',
                                              status_id=4)
                return

        # Resolve the reference once, so that the distance check and SNVPhyl are guaranteed to use the same file
        # (glob results are returned in arbitrary order)
        ref_fasta = glob.glob(os.path.join(work_dir, '*fasta'))[0]
        fastq_dir = os.path.join(work_dir, 'fastqs')
        snvphyl_output_dir = os.path.join(work_dir, 'output')

        # Now extract our query files.
        retrieve_nas_files(seqids=query_list,
                           outdir=fastq_dir,
                           filetype='fastq',
                           copyflag=True)

        # With our query files extracted, verify that all the SEQIDs the user specified were able to be found.
        missing_fastqs = verify_fastqs_present(query_list, fastq_dir)
        if len(missing_fastqs) > 0:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Warning! Could not find the following requested query SEQIDs: '
                                                '{}. \nYou may want to verify the SEQIDs, create a new issue, and try'
                                                ' again.'.format(str(missing_fastqs)))

        # Now check that the FASTQ files aren't too far away from the specified reference file.
        bad_fastqs = check_distances(ref_fasta=ref_fasta,
                                     fastq_folder=fastq_dir,
                                     work_dir=work_dir)
        if len(bad_fastqs) > 0:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Warning! The following SEQIDs were found to be fairly'
                                                ' divergent from the reference file specified:{} \nYou may'
                                                ' want to start a new SNVPhyl issue without them and try '
                                                'again.'.format(str(bad_fastqs)))

        # With everything checked, time to actually run the SNVPhyl. Need to call a snvphyl-specific virtualenv.
        # The issue ID is passed as the docker port.
        cmd = '/mnt/nas/Virtual_Environments/snvphylcli/bin/python /mnt/nas/slurmtest/snvphyl-galaxy-cli/bin/snvphyl.py' \
              ' --deploy-docker --fastq-dir {fastq_dir} ' \
              '--reference-file {ref_file} --min-coverage 5 --output-dir {output} ' \
              '--docker-port {port}'.format(fastq_dir=fastq_dir,
                                            ref_file=ref_fasta,
                                            output=snvphyl_output_dir,
                                            port=issue.id)
        os.system(cmd)

        # Now need to create a zip archive of the results file, upload it, and clean up the fastq files.
        archive_name = 'SNVPhyl_' + str(issue.id)
        shutil.make_archive(os.path.join(work_dir, archive_name), 'zip', snvphyl_output_dir)
        output_list = [{
            'path': os.path.join(work_dir, archive_name + '.zip'),
            'filename': archive_name + '.zip'
        }]
        redmine_instance.issue.update(resource_id=issue.id, uploads=output_list, status_id=4,
                                      notes='SNVPhyl process complete!')

        shutil.rmtree(fastq_dir)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! We log this automatically and will look into the '
                                            'problem and get back to you with a fix soon.')
コード例 #6
0
def ecgf(redmine_instance, issue, work_dir, description):
    """
    Run eCGF on the FASTA file of every requested SEQID, merge the per-sample CSV reports into one
    summary report, and attach a zip archive of all reports to the Redmine issue.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which FASTA files, the helper script, and the results are created
    :param description: Path to a pickled list of SEQIDs (the issue description)
    """
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        fasta_dir = os.path.join(work_dir, 'fastas')
        # Description should just be a list of SEQIDs. Get the fasta files associated with them extracted
        # to the bio_request dir
        retrieve_nas_files(seqids=description,
                           outdir=fasta_dir,
                           filetype='fasta')
        fasta_files = glob.glob(os.path.join(fasta_dir, '*.fasta'))
        # Verify that specified fasta files are actually there, warn user if they aren't.
        missing_fastas = verify_fasta_files_present(seqid_list=description,
                                                    fasta_dir=fasta_dir)
        if len(missing_fastas) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                ' the OLC NAS: {}'.format(missing_fastas))

        # Make output dir
        output_dir = os.path.join(work_dir, 'results')
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/ecgf'
        ecgf_script = os.path.join(work_dir, 'run_ecgf.sh')
        # As the files are processed one at a time, create a list of all the reports in order to create a summary report
        reports = list()
        for fasta in sorted(fasta_files):
            seqid = os.path.split(fasta)[-1].split('.')[0]
            report = os.path.join(output_dir, '{seqid}.csv'.format(seqid=seqid))
            reports.append(report)
            # Create the command line call to eCGF
            cmd = 'eCGF {fasta} {csv}'.format(fasta=fasta, csv=report)
            # Create another shell script to execute within the conda environment. The same script path
            # is overwritten for every sample.
            template = "#!/bin/bash\n{activate} && {cmd}".format(activate=activate, cmd=cmd)
            with open(ecgf_script, 'w+') as script_file:
                script_file.write(template)
            # Modify the permissions of the script to allow it to be run on the node
            make_executable(ecgf_script)
            # Run shell script
            os.system(ecgf_script)

        # Create a summary report of all the individual reports: one header line followed by every data row
        header = str()
        rows = list()
        for report in reports:
            with open(report, 'r') as summary:
                # readline() returns '' on an empty report instead of raising like next() would
                first_line = summary.readline()
                if not header:
                    header = first_line
                for line in summary:
                    # Create a list of the entries by splitting on ,
                    line_list = line.split(',')
                    # Remove the path and extension from the file name
                    line_list[0] = os.path.basename(os.path.splitext(line_list[0])[0])
                    rows.append(','.join(line_list))
        summary_report = os.path.join(
            output_dir, '{id}_summary_report.csv'.format(id=str(issue.id)))
        with open(summary_report, 'w') as summary:
            summary.write(header)
            summary.write(''.join(rows))

        zip_filepath = os.path.join(
            work_dir, 'eCGF_output_{id}'.format(id=str(issue.id)))
        # With eCGF done, zip up the results folder
        shutil.make_archive(root_dir=output_dir,
                            format='zip',
                            base_name=zip_filepath)
        # Prepare upload
        output_list = [{
            'filename': os.path.basename(zip_filepath) + '.zip',
            'path': zip_filepath + '.zip'
        }]
        # Wrap up issue
        redmine_instance.issue.update(resource_id=issue.id,
                                      uploads=output_list,
                                      status_id=4,
                                      notes='Analysis with eCGF complete!')
        # And finally, do some best-effort file cleanup. Each step is independent so that one failure
        # does not leave the remaining files behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        shutil.rmtree(fasta_dir, ignore_errors=True)
        try:
            os.remove(zip_filepath + '.zip')
        except OSError:
            pass

    except Exception as e:
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #7
0
def closerelatives_redmine(redmine_instance, issue, work_dir, description):
    """
    Find the strains closest (by mash distance) to a query SEQID and post them to the Redmine issue.

    The first line of the description is the number of close relatives wanted and the second line is
    the query SEQID. A CSV file with every distance is attached alongside the short list in the note.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which the FASTA link, mash output, and CSV report are created
    :param description: Path to a pickled list of description lines
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        # First line of description should be number of close relatives desired.
        try:
            num_close_relatives = int(description[0])
        except ValueError:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! The first line of the description must be the number'
                                                ' of strains you want to find. The first line of your '
                                                'description was: {}'.format(description[0]),
                                          status_id=4)
            return

        # Second line of description should be the SEQID of what you want to find a close reference for.
        seqid = description[1]

        # Try to extract FASTA files for the specified SEQID.
        retrieve_nas_files(seqids=[seqid],
                           outdir=os.path.join(work_dir, 'fasta'),
                           filetype='fasta',
                           copyflag=False)
        query_fastas = glob.glob(os.path.join(work_dir, 'fasta', '*.fasta'))
        if len(query_fastas) != 1:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='Error! Could not find FASTA file for the specified SEQID. The SEQID'
                                                ' that you specified was: {}'.format(seqid),
                                          status_id=4)
            return

        # Run mash dist with the FASTA file specified against the sketch of all our stuff.
        query_fasta = query_fastas[0]
        distance_file = os.path.join(work_dir, 'distances.tab')
        mash.dist(query_fasta, '/mnt/nas2/redmine/bio_requests/14674/all_sequences.msh',
                  threads=8, output_file=distance_file)
        mash_results = mash.read_mash_output(distance_file)
        # Put all the results into a dictionary, where the key is the sequence name (file name up to the
        # first underscore) and the value is the mash distance reported for it.
        result_dict = dict()
        for item in mash_results:
            seq_name = os.path.split(item.query)[-1].split('_')[0]
            result_dict[seq_name] = item.distance

        # Sort the results, store the sorted dictionary keys in a list.
        sorted_distance_results = sorted(result_dict, key=result_dict.get)

        # Prepare a string that lists the top hit SEQIDs to be posted to redmine. Slicing (rather than
        # indexing up to the requested count) keeps this working when fewer strains are available than
        # were asked for; a negative request yields an empty list.
        top_hits = sorted_distance_results[:max(num_close_relatives, 0)]
        upload_string = ''.join(
            '{} ({})\n'.format(hit.replace('.fasta', ''), result_dict[hit]) for hit in top_hits)

        # Also make a CSV file of all results, in case someone wants to take a closer look.
        csv_report = os.path.join(work_dir, 'close_relatives_results.csv')
        with open(csv_report, 'w') as f:
            f.write('Strain,MashDistance\n')
            for seq in sorted_distance_results:
                f.write('{},{}\n'.format(seq.replace('.fasta', ''), result_dict[seq]))

        output_list = [
            {
                'path': csv_report,
                'filename': 'close_relatives_results.csv'
            }
        ]
        # Post the list of closely related SEQIDs to redmine, as well as the CSV result file. The count in
        # the note is the number of strains actually listed.
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Process complete! Here is the list of the {num_relatives} closest strains '
                                            'to {query_strain} (mash distance between query and result in brackets):'
                                            '\n{upload_string}'.format(num_relatives=str(len(top_hits)),
                                                                       query_strain=seqid,
                                                                       upload_string=upload_string),
                                      status_id=4,
                                      uploads=output_list)

    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! We log this automatically and will look into the '
                                            'problem and get back to you with a fix soon.')
コード例 #8
0
def mob_suite(redmine_instance, issue, work_dir, description):
    """
    Run mob_recon (with typing) in docker on the FASTA file of every requested SEQID, upload a zip
    archive of the results to the FTP site, and report the outcome on the Redmine issue.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which FASTA copies, results, and the zip archive are created
    :param description: Path to a pickled list of SEQIDs (the issue description)
    """
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        fasta_dir = os.path.join(work_dir, 'fastas')
        # Description should just be a list of SEQIDs. Get the fasta files associated with them extracted
        # to the bio_request dir
        retrieve_nas_files(
            seqids=description,
            outdir=fasta_dir,
            filetype='fasta',
            copyflag=True
        )  # Since we're docker-ing need to copy. Files get cleaned up at end of process
        # Now we need to run mob_recon (and typing!) on each of the fasta files requested. Put all results into one
        # folder (this will need to be uploaded to FTP - will overwhelm max (10MB) file size limit on Redmine

        fasta_files = glob.glob(os.path.join(fasta_dir, '*.fasta'))
        # Verify that specified fasta files are actually there, warn user if they aren't.
        missing_fastas = verify_fasta_files_present(seqid_list=description,
                                                    fasta_dir=fasta_dir)
        if len(missing_fastas) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                ' the OLC NAS: {}'.format(missing_fastas))

        # Make output dir
        output_dir = os.path.join(work_dir, 'mob_suite_results')
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        for fasta in fasta_files:
            seqid = os.path.split(fasta)[-1].split('.')[0]
            # Run mobsuite via docker, since I can't seem to make it work with slurm any other way.
            cmd = 'docker run --rm -i -u $(id -u) -v /mnt/nas2:/mnt/nas2 mob_suite:latest /bin/bash -c "source activate ' \
                  '/mnt/nas2/virtual_environments/mob_suite && mob_recon -i {input_fasta} -o {output_dir} ' \
                  '--run_typer"'.format(input_fasta=fasta,
                                        output_dir=os.path.join(output_dir, seqid))
            os.system(cmd)

        # With mobsuite done, zip up the results folder and upload to the FTP.
        zip_name = str(issue.id) + '.zip'
        zip_path = os.path.join(work_dir, zip_name)
        shutil.make_archive(root_dir=output_dir,
                            format='zip',
                            base_name=os.path.join(work_dir, str(issue.id)))

        upload_successful = upload_to_ftp(local_file=zip_path)

        # And finally, do some best-effort file cleanup. Each step is independent so that one failure
        # does not leave the remaining files behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        shutil.rmtree(fasta_dir, ignore_errors=True)
        try:
            os.remove(zip_path)
        except OSError:
            pass

        if upload_successful:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes='Mob-suite process complete!\n\n'
                'Results are available at the following FTP address:\n'
                'ftp://ftp.agr.gc.ca/outgoing/cfia-ak/{}'.format(zip_name))
        else:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes=
                'Upload of result files was unsuccessful due to FTP connectivity issues. '
                'Please try again later.')

    except Exception as e:
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #9
0
def hybridassembly_redmine(redmine_instance, issue, work_dir, description):
    """
    Run hybrid (MinION + Illumina) assemblies and upload the zipped results to the FTP site.

    The first line of the description is the nanopore sequence folder; every following line is a
    comma-separated ``minion_id,seqid,oln_id`` triple describing one assembly to run.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which FASTQ links and assembly output folders are created
    :param description: Path to a pickled list of description lines
    """
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        sequence_folder = description[0]

        # Each remaining line gives [minion_id, seqid, oln_id]; extra fields on a line are ignored.
        hybrid_info = [line.rstrip().split(',')[:3] for line in description[1:]]
        for entry in hybrid_info:
            if len(entry) < 3:
                # Same exception type the original indexing raised, with a clearer message.
                raise IndexError('Expected minion_id,seqid,oln_id but got: {}'.format(','.join(entry)))

        local_folder = os.path.join('/mnt/nas2/raw_sequence_data/nanopore', sequence_folder)

        download_dir(ftp_dir=sequence_folder,
                     local_dir=local_folder)

        seqids = [entry[1] for entry in hybrid_info]

        # Link the FASTQs needed to run hybrid assemblies to our working dir.
        fastq_dir = os.path.join(work_dir, 'fastqs')
        retrieve_nas_files(seqids=seqids,
                           outdir=fastq_dir,
                           filetype='fastq')

        # TODO: Figure out conda env activation, it's always a pain
        # Now need to run the hybrid_assembly.py script, which will run Unicycler.
        for minion_id, seqid, oln_id in hybrid_info:
            minion_reads = glob.glob(os.path.join(local_folder, minion_id + '*'))[0]
            illumina_forward = glob.glob(os.path.join(fastq_dir, seqid + '*_R1*'))[0]
            # The reverse reads are the _R2 file; the original globbed _R1 again and so passed the
            # forward reads as both mates.
            illumina_reverse = glob.glob(os.path.join(fastq_dir, seqid + '*_R2*'))[0]
            output_dir = os.path.join(work_dir, oln_id)
            cmd = 'python /mnt/nas/Redmine/OLCRedmineAutomator/automators/hybrid_assembly.py -1 {forward} -2 {reverse} ' \
                  '-p {minion} -o {output} -n {name}'.format(forward=illumina_forward,
                                                             reverse=illumina_reverse,
                                                             minion=minion_reads,
                                                             output=output_dir,
                                                             name=oln_id)
            os.system(cmd)

        # Now that this is done, need to make a report that looks at least a bit like our old combined metadata report

        # At this point, zip folder has been created (hopefully) called issue_id.zip in biorequest dir. Upload that
        # to the FTP.
        # NOTE(review): nothing in this function creates that zip file - confirm that another step produces it.
        s = FTP('ftp.agr.gc.ca', user=FTP_USERNAME, passwd=FTP_PASSWORD)
        s.cwd('outgoing/cfia-ak')
        # The context manager closes the archive even if the transfer fails part way through.
        with open(os.path.join(work_dir, str(issue.id) + '.zip'), 'rb') as f:
            s.storbinary('STOR {}.zip'.format(str(issue.id)), f)
        s.quit()

        # Make redmine tell Paul that a run has finished and that we should add things to our DB so things don't get
        # missed. The user id to assign to is 226.
        # NOTE(review): the original comment names the field 'assigned_to_id' while the call passes
        # 'assigned_to' - confirm which keyword the Redmine client expects.
        redmine_instance.issue.update(resource_id=issue.id,
                                      assigned_to=226,
                                      notes='Hybrid assembly complete. Please add it to the OLC Database.\n')

    except Exception as e:
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! Send this error traceback to your friendly '
                                            'neighborhood bioinformatician: {}'.format(e))
        # print_exc() writes the traceback itself and returns None, so it must not be wrapped in print().
        traceback.print_exc()
コード例 #10
0
def strainmash_redmine(redmine_instance, issue, work_dir, description):
    """
    Screen the FASTA file of every requested SEQID against the type strain mash sketch and attach a
    zip archive of the per-sample results to the Redmine issue.

    Unlike the other automators, errors are not caught here; they propagate to the caller.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which FASTA links, the SEQID list, and the output folder are created
    :param description: Path to a pickled list of SEQIDs (the issue description)
    """
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    # Reference path
    typestrain_db_path = '/mnt/nas/Databases/GenBank/typestrains/typestrains_sketch.msh'

    # Parse description to get list of SeqIDs
    seqids = [item.upper() for item in description]

    with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
        for seqid in seqids:
            f.write(seqid + '\n')

    # Drop FASTA files into workdir
    retrieve_nas_files(seqids=seqids,
                       outdir=work_dir,
                       filetype='fasta',
                       copyflag=False)

    # Create output directory (tolerating one left over from an earlier attempt)
    output_dir = os.path.join(work_dir, 'output')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Get all of the FASTA files
    fasta_list = glob.glob(os.path.join(work_dir, '*.fasta'))

    # Run mash_screen on everything
    for item in fasta_list:
        # Join only the file name: joining output_dir with the full FASTA path discards output_dir when
        # that path is absolute, which is what used to drop the results into work_dir.
        sample_name = os.path.basename(item).replace('.fasta', '')
        output_filepath = os.path.join(output_dir, sample_name + '_strainmash.txt')

        mash_screen(reference=typestrain_db_path,
                    queryfile=item,
                    outname=output_filepath)

    # Safety net kept from the original workaround: move any result that still ended up in work_dir
    # into the output folder. With the path fix above this normally finds nothing.
    for stray_file in glob.glob(os.path.join(work_dir, '*strainmash.txt')):
        os.rename(stray_file, os.path.join(output_dir, os.path.basename(stray_file)))

    # Zip output folder. make_archive returns the path of the archive it wrote, so there is no need to
    # glob for it (which could pick up an unrelated zip file).
    zip_file = shutil.make_archive(output_dir, 'zip', work_dir, 'output')

    # Output list containing dictionaries with file path as the key
    output_list = [{
        'path': zip_file,
        'filename': os.path.basename(zip_file)
    }]

    # Upload files, set status to Feedback
    redmine_instance.issue.update(
        resource_id=issue.id,
        uploads=output_list,
        status_id=4,
        notes='STRAINMASH complete. See attached file for results.')

    # Delete all of the FASTA files
    for fasta in fasta_list:
        os.remove(fasta)

    # Delete the output folder
    shutil.rmtree(output_dir)
コード例 #11
0
def diversitree_redmine(redmine_instance, issue, work_dir, description):
    """
    Build a tree (parsnp or mashtree) from the requested SEQIDs, pick the desired number of diverse
    strains from it with StrainChoosr, and attach the tree and HTML report to the Redmine issue.

    The first line of the description is the number of strains to pick and the following lines are
    SEQIDs. An optional final line ``TREEPROGRAM=parsnp`` or ``TREEPROGRAM=mashtree`` selects the tree
    builder; parsnp is used when it is absent.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which FASTA links, the tree, and the report are created
    :param description: Path to a pickled list of description lines
    """
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        fasta_dir = os.path.join(work_dir, 'fastas')
        tree_dir = os.path.join(work_dir, 'output')
        tree_file = os.path.join(tree_dir, 'parsnp.tree')
        if not os.path.isdir(fasta_dir):
            os.makedirs(fasta_dir)
        # Check that the first line of the request is a number. If it isn't, tell author they goofed and give up.
        try:
            desired_num_strains = int(description[0])
        except ValueError:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'Error! The first line of your request must be the number of'
                ' strains you want picked from the tree.',
                status_id=4)
            return

        # Parse description for SEQIDs, write list that file_extractor needs.
        seqids = [item.upper() for item in description[1:]]

        # An optional trailing TREEPROGRAM=<name> line picks the tree builder. Guard against an empty
        # SEQID list, and use partition so that a line without '=' becomes an invalid choice rather
        # than an IndexError.
        if seqids and 'TREEPROGRAM' in seqids[-1]:
            treemaker = seqids.pop().partition('=')[2].strip().lower()
            if treemaker not in ['parsnp', 'mashtree']:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes=
                    'Error! Available tree creation programs are mashtree and parsnp. '
                    'Your choice was {}'.format(treemaker),
                    status_id=4)
                return
        else:
            treemaker = 'parsnp'

        # Drop FASTA files into workdir
        retrieve_nas_files(seqids=seqids,
                           outdir=fasta_dir,
                           filetype='fasta',
                           copyflag=False)
        fasta_files = glob.glob(os.path.join(fasta_dir, '*.fasta'))
        if not fasta_files:
            # Without this check the reference lookup below fails with an opaque IndexError.
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='Error! Could not find FASTA files for any of the requested SEQIDs.',
                status_id=4)
            return
        # Run a mash to figure out if any strains are particularly far apart and likely to make PARSNP fail.
        reference_file = fasta_files[0]
        bad_fastas = check_distances(reference_file, fasta_dir)
        if bad_fastas:
            outstr = ''.join(os.path.split(fasta)[-1] + '\n' for fasta in bad_fastas)
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'Warning! MASH screening thinks that the following samples may be too'
                ' far from the reference: {samples}\nIn this case, the reference file'
                ' was {reference}. You may want to create a new issue and '
                'try again.'.format(
                    samples=outstr,
                    reference=os.path.split(reference_file)[-1]))

        # Remove distances.tab and sketch.msh from fastas folder, because sometimes they make
        # parsnp crash. Other times they don't. I have no idea why, so remove just to be safe.
        try:
            os.remove(os.path.join(fasta_dir, 'distances.tab'))
            os.remove(os.path.join(fasta_dir, 'sketch.msh'))
        except OSError:
            pass

        # Full paths needed here since SLURM doesn't give the $PATH of the host machine to the script for some reason
        if treemaker == 'parsnp':
            cmd = '/mnt/nas/Programs/Parsnp-Linux64-v1.2/parsnp -r ! -d {input} -o {output} -p {threads}'.format(
                input=fasta_dir,
                output=tree_dir,
                threads=24)
        elif treemaker == 'mashtree':
            if not os.path.isdir(tree_dir):
                os.makedirs(tree_dir)
            cmd = '/home/ubuntu/bin/mashtree --numcpus 24 --outtree {output_newick} {input_fastas}'.format(
                output_newick=tree_file,
                input_fastas=os.path.join(fasta_dir, '*.fasta'))
        # Values in env are passed verbatim (the shell does not expand them), so the literal '$PERL5LIB'
        # used previously never picked up the existing setting. Build the value explicitly instead.
        # NOTE(review): as before, the child environment contains only PERL5LIB - confirm that dropping
        # every other variable is intended.
        perl5lib = ':'.join(
            path for path in (os.environ.get('PERL5LIB'), '/home/ubuntu/lib/perl5') if path)
        returncode = subprocess.call(
            cmd,
            shell=True,
            env={'PERL5LIB': perl5lib})
        if returncode != 0:
            raise Exception(
                'Tree creation command ({}) for {} had return code {}'.format(
                    cmd, issue.id, returncode))
        # Now use diversitree to pick the strains we actually want.
        # IMPORTANT NOTE TO ANYONE MAINTAINING THIS: Need to have xvfb installed on nodes in order to make this run.
        # StrainChoosr uses ete3 to draw trees, which uses PyQt, which needs some sort of display.
        cmd = 'xvfb-run strainchoosr --treefile {tree} --number {number} ' \
              '--output_name {output}'.format(tree=tree_file,
                                              number=desired_num_strains,
                                              output=os.path.join(work_dir, 'diversitree_report'))
        returncode = subprocess.call(cmd, shell=True)
        if returncode != 0:
            raise Exception(
                'StrainChoosr command ({}) for {} had return code {}'.format(
                    cmd, issue.id, returncode))
        output_list = [
            {
                'path': tree_file,
                'filename': 'tree.nwk'
            },
            {
                'path': os.path.join(work_dir, 'diversitree_report.html'),
                'filename': 'diversitree_report.html'
            }
        ]
        redmine_instance.issue.update(resource_id=issue.id,
                                      uploads=output_list,
                                      status_id=4,
                                      notes='DiversiTree process complete!')
        # Clean up the FASTA files. They are placed in work_dir/fastas, which the original
        # 'rm work_dir/*fasta' never touched; the old location is still swept for good measure.
        for pattern in (os.path.join(work_dir, '*fasta'), os.path.join(fasta_dir, '*fasta')):
            for fasta in glob.glob(pattern):
                try:
                    os.remove(fasta)
                except OSError:
                    pass
    except Exception as e:
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #12
0
def merge_redmine(redmine_instance, issue, work_dir, description):
    """
    Merge FASTQ files as described in the Excel file attached to the issue, assemble the merged reads
    with the COWBAT pipeline in docker, and attach the zipped assembly reports to the Redmine issue.

    :param redmine_instance: Path to a pickled Redmine connection object
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which the Excel file, FASTQ links, and merged reads are created
    :param description: Path to a pickled issue description (unpickled but not otherwise used here)
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects, closing every file handle as soon as it has been read.
    # NOTE(review): pickle must only be used on files written by the automator itself.
    with open(redmine_instance, 'rb') as pickled:
        redmine_instance = pickle.load(pickled)
    with open(issue, 'rb') as pickled:
        issue = pickle.load(pickled)
    with open(description, 'rb') as pickled:
        description = pickle.load(pickled)

    try:
        merged_name = 'merged_' + str(issue.id)
        merged_folder = os.path.join(work_dir, merged_name)
        hdfs_folder = os.path.join('/hdfs', merged_name)
        assembled_folder = os.path.join('/mnt/nas2/processed_sequence_data/merged_assemblies',
                                        merged_name + '_Assembled')
        merge_file = os.path.join(work_dir, 'Merge.xlsx')

        # Download the attached excel file.
        # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
        # out a better way to do it. When several files are attached, the last one listed wins.
        attachment = redmine_instance.issue.get(issue.id,
                                                include='attachments')
        attachment_id = 0
        for item in attachment.attachments:
            attachment_id = item.id

        # An attachment id of 0 indicates that we didn't find anything attached to the issue.
        if attachment_id == 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'ERROR: Did not find any attached files. Please create a new issue with '
                'the merge excel file attached and try again.',
                status_id=4)
            return
        attachment = redmine_instance.attachment.get(attachment_id)
        attachment.download(savepath=work_dir, filename='merge.xlsx')

        # Now use convert_excel_file to make compatible with merger.py
        convert_excel_file(os.path.join(work_dir, 'merge.xlsx'), merge_file)

        # Make a SEQID list of files we'll need to extract.
        seqid_list = generate_seqid_list(merge_file)

        # Create links of all seqids needed in our working dir
        retrieve_nas_files(seqids=seqid_list,
                           outdir=work_dir,
                           filetype='fastq',
                           copyflag=False)

        merge_files(mergefile=merge_file,
                    work_dir=work_dir)

        # Make a folder to put all the merged FASTQs in biorequest folder. and put the merged FASTQs there.
        # Tolerate a folder left over from an earlier attempt instead of failing on it.
        if not os.path.isdir(merged_folder):
            os.makedirs(merged_folder)
        cmd = 'mv {merged_files} {merged_folder}'.format(
            merged_files=os.path.join(work_dir, '*MER*.fastq.gz'),
            merged_folder=merged_folder)
        os.system(cmd)

        if not glob.glob(os.path.join(merged_folder, '*fastq.gz')):
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'ERROR: Something went wrong, no merged FASTQ files were created.',
                status_id=4)
            return
        # Now copy those merged FASTQS to merge backup and the hdfs folder so they can be assembled.
        cmd = 'cp {merged_files} /mnt/nas2/raw_sequence_data/merged_sequences'.format(
            merged_files=os.path.join(merged_folder, '*.fastq.gz'))
        os.system(cmd)

        cmd = 'cp -r {merged_folder} /hdfs'.format(merged_folder=merged_folder)
        os.system(cmd)

        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Merged FASTQ files created, beginning assembly of merged files.')
        # With files copied over to the HDFS, start the assembly process (Now using new pipeline!)
        # Remove any container named cowbat first, since the run below reuses that name.
        cmd = 'docker rm -f cowbat'
        os.system(cmd)
        cmd = 'docker run -i -u $(id -u) -v /mnt/nas2:/mnt/nas2 -v /hdfs:/hdfs --name cowbat --rm {cowbat_image} /bin/bash -c ' \
              '"source activate cowbat && assembly_pipeline.py -s {hdfs_folder} ' \
              '-r {cowbat_databases}"'.format(hdfs_folder=hdfs_folder,
                                             cowbat_image=COWBAT_IMAGE,
                                             cowbat_databases=COWBAT_DATABASES)
        os.system(cmd)

        # Move results to merge_WGSspades, and upload the results folder to redmine.
        cmd = 'mv {hdfs_folder} {merge_WGSspades}'.format(
            hdfs_folder=hdfs_folder,
            merge_WGSspades=assembled_folder)
        os.system(cmd)
        shutil.make_archive(
            os.path.join(work_dir, 'reports'), 'zip',
            os.path.join(assembled_folder, 'reports'))
        output_list = [{
            'path': os.path.join(work_dir, 'reports.zip'),
            'filename': merged_name + '_reports.zip'
        }]
        redmine_instance.issue.update(
            resource_id=issue.id,
            uploads=output_list,
            status_id=4,
            notes='Merge Process Complete! Reports attached.')
    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! We log this automatically and will look into the '
            'problem and get back to you with a fix soon.')
コード例 #13
0
def externalretrieve_redmine(redmine_instance, issue, work_dir, description):
    """Bundle the requested FASTA/FASTQ files into a zip archive and upload it to the FTP site.

    Every line of the description is treated as a SEQID, except for lines containing 'FASTA' or 'FASTQ':
    those are headers that switch the file type requested for the SEQIDs that follow. SEQIDs listed before
    any header are treated as FASTQ requests. The outcome is reported as a note on the Redmine issue.

    :param redmine_instance: Path to a pickle file holding the Redmine connection object
    :param issue: Path to a pickle file holding the Redmine issue being processed
    :param work_dir: Directory in which the files are collected and the archive is built
    :param description: Path to a pickle file holding the lines of the issue description
    """
    print('External retrieving!')
    # Unpickle Redmine objects, closing each pickle file as soon as it has been read.
    # NOTE(review): pickle.load can execute arbitrary code - these paths must only ever point at pickles
    # written by the automator itself.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        issue_dir = os.path.join(work_dir, str(issue.id))
        archive_path = os.path.join(work_dir, str(issue.id) + '.zip')
        os.makedirs(issue_dir)

        # Parse description to figure out what SEQIDs we need to run on.
        fasta_list = list()
        fastq_list = list()
        # FASTQ is the default file type until a FASTA header line is encountered.
        current_list = fastq_list
        for item in description:
            item = item.upper()
            if 'FASTA' in item:
                current_list = fasta_list
                continue
            if 'FASTQ' in item:
                current_list = fastq_list
                continue
            current_list.append(item)

        # Use NAStools to put FASTA and FASTQ files into our working dir.
        retrieve_nas_files(seqids=fasta_list,
                           outdir=issue_dir,
                           filetype='fasta',
                           copyflag=True)
        retrieve_nas_files(seqids=fastq_list,
                           outdir=issue_dir,
                           filetype='fastq',
                           copyflag=True)

        # Check that we got all the requested files.
        missing_fastas = check_fastas_present(fasta_list, issue_dir)
        missing_fastqs = check_fastqs_present(fastq_list, issue_dir)
        if len(missing_fastqs) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested FASTQ SEQIDs on'
                      ' the OLC NAS: {}'.format(missing_fastqs))

        if len(missing_fastas) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested FASTA SEQIDs on'
                      ' the OLC NAS: {}'.format(missing_fastas))

        # Now make a zip folder that we'll upload to the FTP. make_archive appends '.zip' to base_name, so the
        # archive ends up at archive_path.
        shutil.make_archive(base_name=issue_dir,
                            format='zip',
                            root_dir=issue_dir)

        # Now need to login to the FTP to upload the zipped folder.
        # NOTE(review): FTP uploads have been unreliable; the retry/timeout handling lives in upload_to_ftp
        # (the previous comment here mentioned up to 10 attempts - confirm against that helper).
        upload_successful = upload_to_ftp(local_file=archive_path)

        # And finally, do some file cleanup. This is best effort: failing to remove one item must neither stop
        # the other from being removed nor prevent the user from being told how the upload went.
        shutil.rmtree(issue_dir, ignore_errors=True)
        try:
            os.remove(archive_path)
        except OSError as e:
            print('Could not remove {}: {}'.format(archive_path, e))

        if upload_successful is False:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes='There are connection issues with the FTP site. Unable to complete '
                      'external retrieve process. Please try again later.')
        else:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes='External Retrieve process complete!\n\n'
                      'Results are available at the following FTP address:\n'
                      'ftp://ftp.agr.gc.ca/outgoing/cfia-ak/{}'.format(
                          str(issue.id) + '.zip'))
    except Exception as e:
        # Top-level boundary: report any failure on the issue rather than letting the task die silently.
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Something went wrong! Send this error traceback to your friendly '
                  'neighborhood bioinformatician: {}'.format(e))
コード例 #14
0
def neartree_redmine(redmine_instance, issue, work_dir, description):
    """Report the strains that sit closest to a query strain in a parsnp tree.

    The first line of the description must be the number of strains to report. A line containing 'QUERY'
    starts the list of query SEQIDs (more than one is rejected) and a line containing 'REFERENCE' starts the
    list of SEQIDs the query is compared against. Results and errors are posted as notes on the Redmine issue.

    :param redmine_instance: Path to a pickle file holding the Redmine connection object
    :param issue: Path to a pickle file holding the Redmine issue being processed
    :param work_dir: Directory in which the query file, the 'fastas' folder and the parsnp output are placed
    :param description: Path to a pickle file holding the lines of the issue description
    """
    # Unpickle Redmine objects, closing each pickle file as soon as it has been read.
    # NOTE(review): pickle.load can execute arbitrary code - these paths must only ever point at pickles
    # written by the automator itself.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        fastas_dir = os.path.join(work_dir, 'fastas')
        os.makedirs(fastas_dir, exist_ok=True)
        # Check that the first line of the request is a number. If it isn't (or the description is empty),
        # tell author they goofed and give up.
        try:
            desired_num_strains = int(description[0])
        except (ValueError, IndexError):
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='Error! The first line of your request must be the number of'
                      ' strains you want picked from the tree.',
                status_id=4)
            return

        # Go through description to figure out what our query is and what the reference is. Lines that appear
        # before the first QUERY/REFERENCE header belong to neither list and are ignored.
        query_list = list()
        seqid_list = list()
        current_list = None
        for line in description[1:]:
            item = line.upper()
            if item == '':
                continue
            if 'QUERY' in item:
                current_list = query_list
                continue
            if 'REFERENCE' in item:
                current_list = seqid_list
                continue
            if current_list is not None:
                current_list.append(item)

        # Only allowed to have one query file - boot the user if they tried to specify too many queries.
        # Stop here (with the same status as the other validation failure above) instead of carrying on with
        # an arbitrary one of the queries.
        if len(query_list) > 1:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='ERROR: You specified {query_list_len} query files ({query_list}). Only'
                      ' one query file is supposed to be specified. '
                      'Please try again.'.format(query_list_len=len(query_list),
                                                 query_list=query_list),
                status_id=4)
            return

        # Drop FASTA files into workdir
        retrieve_nas_files(seqids=seqid_list,
                           outdir=fastas_dir,
                           filetype='fasta',
                           copyflag=False)
        # Also retrieve the query file.
        retrieve_nas_files(seqids=query_list,
                           outdir=work_dir,
                           filetype='fasta',
                           copyflag=False)
        # TODO: Add check that the requested reference files were able to be retrieved.
        # Without a query file there is nothing to build the tree around, so say so explicitly rather than
        # failing with an opaque IndexError below.
        query_files = glob.glob(os.path.join(work_dir, '*.fasta'))
        if not query_files:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='ERROR: Could not find the query file for {query_list} on the OLC NAS. '
                      'Please check the query SEQID and try again.'.format(query_list=query_list),
                status_id=4)
            return
        reference_file = query_files[0]

        # Run a mash to figure out if any strains are particularly far apart and likely to make PARSNP fail.
        make_ref(reference_file, os.path.join(work_dir, 'reference.fasta'))
        bad_fastas = check_distances(reference_file, fastas_dir)
        if bad_fastas:
            outstr = ''.join(os.path.split(fasta)[-1] + '\n' for fasta in bad_fastas)
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='Warning! MASH screening thinks that the following samples may be too'
                      ' far from the reference: {samples}\nIn this case, the reference file'
                      ' was {reference}. You may want to create a new issue and '
                      'try again.'.format(samples=outstr,
                                          reference=os.path.split(reference_file)[-1]))

        # Remove distances.tab and sketch.msh from fastas folder, because sometimes they make
        # parsnp crash. Other times they don't. I have no idea why, so remove just to be safe.
        # Each file is removed independently so that a missing one does not leave the other behind.
        for leftover in ('distances.tab', 'sketch.msh'):
            try:
                os.remove(os.path.join(fastas_dir, leftover))
            except OSError:
                pass

        cmd = '/mnt/nas/Programs/Parsnp-Linux64-v1.2/parsnp -r {workdir}/reference.fasta -d {input} ' \
              '-c -o {output} -p {threads}'.format(threads=48,
                                                   workdir=work_dir,
                                                   input=fastas_dir,
                                                   output=os.path.join(work_dir, 'parsnp_output'))
        os.system(cmd)

        tree = Phylo.read(os.path.join(work_dir, 'parsnp_output', 'parsnp.tree'), 'newick')
        # If several clades match, the last one found is used as the reference.
        ref_clade = None
        for clade in tree.find_clades('reference.fasta.ref'):
            ref_clade = clade
        if ref_clade is None:
            raise ValueError('Could not find reference.fasta.ref in the parsnp tree')
        distance_dict = {clade.name: tree.distance(clade, ref_clade) for clade in tree.get_terminals()}

        # Walk the strains from nearest to farthest and keep the requested number of non-reference names.
        closest = list()
        for name, _ in sorted(distance_dict.items(), key=lambda x: x[1]):
            if len(closest) >= desired_num_strains:
                break
            if 'reference' not in name:
                closest.append(name.replace('.fasta', '') + '\n')
        outstr = ''.join(closest)

        redmine_instance.issue.update(
            resource_id=issue.id,
            status_id=4,
            notes='NearTree process complete! Closest strains are:\n {}'.format(outstr))

        # Clean up the FASTA files that were dropped into the 'fastas' folder. Best effort: the results have
        # already been reported, so a failed removal must not trigger the error note below.
        for fasta_file in glob.glob(os.path.join(fastas_dir, '*fasta')):
            try:
                os.remove(fasta_file)
            except OSError:
                pass
    except Exception as e:
        # Top-level boundary: report any failure on the issue rather than letting the task die silently.
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Something went wrong! Send this error traceback to your friendly '
                  'neighborhood bioinformatician: {}'.format(e))
コード例 #15
0
def sipprverse_redmine(redmine_instance, issue, work_dir, description):
    """Run a sipprverse analysis on the requested SEQIDs and attach the zipped reports to the issue.

    Description lines containing AVERAGEDEPTH, CUTOFF, KMERSIZE or ANALYSIS are read as 'key=value' options,
    a line containing ALLOWSOFTCLIPS enables soft clipping, and every other line is treated as a SEQID.
    The 'custom' analysis additionally requires a FASTA file of targets attached to the issue.

    :param redmine_instance: Path to a pickle file holding the Redmine connection object
    :param issue: Path to a pickle file holding the Redmine issue being processed
    :param work_dir: Directory in which FASTQ files are linked and the outputs are written
    :param description: Path to a pickle file holding the lines of the issue description
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects, closing each pickle file as soon as it has been read.
    # NOTE(review): pickle.load can execute arbitrary code - these paths must only ever point at pickles
    # written by the automator itself.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)
    # Current list of analysis types that the sipprverse can perform
    analyses = [
        'custom', 'full', 'gdcs', 'genesippr', 'mash', 'mlst', 'pointfinder',
        'resfinder', 'rmlst', 'serosippr', 'sixteens', 'virulence'
    ]
    # Dictionary of analysis types to argument flags to pass to the script
    argument_flags = {
        'custom': '-U {fasta}'.format(fasta=os.path.join(work_dir, 'targets.tfa')),
        'gdcs': '-Q',
        'genesippr': '-G',
        'mash': '-C',
        'mlst': '-M',
        'pointfinder': '-P',
        'resfinder': '-A',
        'rmlst': '-R',
        'serosippr': '-S',
        'sixteens': '-X',
        'virulence': '-V',
        'full': '-F'
    }
    # Variable to hold supplied arguments (pre-populated with the defaults)
    argument_dict = {
        'analysis': str(),
        'averagedepth': 2,
        'kmersize': 19,
        'cutoff': 0.90,
        'allowsoftclips': False,
    }
    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = list()
        for item in description:
            item = item.upper().rstrip()
            if 'AVERAGEDEPTH' in item:
                argument_dict['averagedepth'] = int(item.split('=')[1].lower())
                continue
            if 'CUTOFF' in item:
                argument_dict['cutoff'] = float(item.split('=')[1].lower())
                continue
            if 'KMERSIZE' in item:
                argument_dict['kmersize'] = int(item.split('=')[1].lower())
                continue
            if 'ANALYSIS' in item:
                argument_dict['analysis'] = item.split('=')[1].lower()
                continue
            if 'ALLOWSOFTCLIPS' in item:
                argument_dict['allowsoftclips'] = True
                continue
            # Otherwise the item should be a SEQID
            seqids.append(item)
        if argument_dict['analysis'] == 'custom':
            # Download the attached FASTA file.
            # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
            # out a better way to do it. If several files are attached, the last one listed is used.
            issue_with_attachments = redmine_instance.issue.get(issue.id,
                                                                include='attachments')
            attachment_id = 0
            for attached_file in issue_with_attachments.attachments:
                attachment_id = attached_file.id
            # The custom targets are stored in (and read from) the working directory
            dbpath = work_dir
            # Download if attachment id is not 0, which indicates that we didn't find anything attached to the issue.
            if attachment_id != 0:
                attachment = redmine_instance.attachment.get(attachment_id)
                attachment.download(savepath=work_dir, filename='targets.tfa')
            else:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes='ERROR: Analysis type custom requires an attached FASTA file of '
                          'targets. The automator could not find any attached files. '
                          'Please create a new issue with the FASTA file attached and try '
                          'again.',
                    status_id=4)
                return
        else:
            dbpath = COWBAT_DATABASES
        # Ensure that the analysis type is provided
        if not argument_dict['analysis']:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: No analysis type provided. '
                      'Please ensure that issue contains '
                      '"analysistype=requested_analysis_type", where requested_analysis_type '
                      ' is one of the following keywords: {ats}. See the the usage guide: '
                      'https://olc-bioinformatics.github.io/redmine-docs/analysis/sipprverse/'
                      ' for additional details'.format(ats=','.join(analyses)))
            return
        # Ensure that the supplied analysis type is valid
        if argument_dict['analysis'] not in analyses:
            redmine_instance.issue \
                .update(resource_id=issue.id,
                        notes='WARNING: Requested analysis type: {at} not in list of supported analyses: {ats}. Please '
                              'see https://olc-bioinformatics.github.io/redmine-docs/analysis/sipprverse/ '
                              'for additional details'.format(at=argument_dict['analysis'],
                                                              ats=','.join(analyses)))
            return
        # Ensure that SEQIDs were included. There is nothing to analyse without them, so stop here like the
        # other validation checks above rather than launching the pipeline on an empty request.
        if not seqids:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: No SEQIDs provided!')
            return
        # Run file linker and then make sure that all FASTQ files requested are present. Warn user if they
        # requested things that we don't have.
        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype='fastq',
                           copyflag=False)
        missing_fastqs = verify_fastq_files_present(seqids, work_dir)
        # Update the Redmine issue if one or more of the requested SEQIDs could not be located
        if missing_fastqs:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                      ' the OLC NAS: {}'.format(missing_fastqs))
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/sipprverse'
        sippr_py = '/mnt/nas2/virtual_environments/sipprverse/bin/sippr.py'
        # Run sipprverse with the necessary arguments
        sippr_cmd = 'python {sippr_py} -s {seqpath} -o {outpath} -r {dbpath} -a {ad} -k {ks} -c {cut} {at}'\
            .format(sippr_py=sippr_py,
                    seqpath=work_dir,
                    outpath=work_dir,
                    dbpath=dbpath,
                    ad=argument_dict['averagedepth'],
                    ks=argument_dict['kmersize'],
                    cut=argument_dict['cutoff'],
                    at=argument_flags[argument_dict['analysis']])
        # Add the allow_soft_clips option if required
        if argument_dict['allowsoftclips']:
            sippr_cmd += ' -sc'
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Sipprverse command:\n {cmd}'.format(cmd=sippr_cmd))
        # Create another shell script to execute within the conda environment
        template = "#!/bin/bash\n{} && {}".format(activate, sippr_cmd)
        sipprverse_script = os.path.join(work_dir, 'run_sipprverse.sh')
        with open(sipprverse_script, 'w+') as script_handle:
            script_handle.write(template)
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(sipprverse_script)
        # Run shell script
        os.system(sipprverse_script)

        # Zip output
        zip_filepath = zip_folder(results_path=os.path.join(work_dir, 'reports'),
                                  output_dir=work_dir,
                                  output_filename='sipprverse_output')
        zip_filepath += '.zip'
        # Prepare upload
        output_list = [{
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }]

        # Clean up the working directory by removing every folder in it (the zip file itself is kept)
        for folder in glob.glob(os.path.join(work_dir, '*/')):
            if os.path.isdir(folder):
                shutil.rmtree(folder)
        # Wrap up issue
        redmine_instance.issue.update(
            resource_id=issue.id,
            uploads=output_list,
            status_id=4,
            notes='{at} analysis with sipprverse complete!'.format(at=argument_dict['analysis']))
    except Exception as e:
        # Top-level boundary: log the failure to Sentry and let the user know something went wrong.
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Something went wrong! We log this automatically and will look into the '
                  'problem and get back to you with a fix soon.')
コード例 #16
0
def clark_redmine(redmine_instance, issue, work_dir, description):
    """Classify the requested SEQIDs with CLARK and attach the abundance report to the issue.

    Every line of the description is treated as a SEQID, except for lines containing 'FASTA': if such a line
    is present the analysis runs on FASTA files, otherwise it runs on FASTQ files.

    :param redmine_instance: Path to a pickle file holding the Redmine connection object
    :param issue: Path to a pickle file holding the Redmine issue being processed
    :param work_dir: Directory in which the sequence files are linked and the reports are written
    :param description: Path to a pickle file holding the lines of the issue description
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects, closing each pickle file as soon as it has been read.
    # NOTE(review): pickle.load can execute arbitrary code - these paths must only ever point at pickles
    # written by the automator itself.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)

    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = list()
        fasta = False
        for item in description:
            item = item.upper()
            if 'FASTA' in item:
                fasta = True
                continue
            seqids.append(item)

        # Write SEQIDs to file to be extracted and CLARKed.
        with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
            f.writelines(seqid + '\n' for seqid in seqids)

        # Link the requested files (FASTA if asked for, FASTQ otherwise) and then make sure that all of them
        # are present. Warn user if they requested things that we don't have.
        if fasta:
            filetype = 'fasta'
            find_missing = verify_fasta_files_present
        else:
            filetype = 'fastq'
            find_missing = verify_fastq_files_present
        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype=filetype,
                           copyflag=False)
        missing_files = find_missing(seqids, work_dir)
        if missing_files:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                      ' the OLC NAS: {}'.format(missing_files))

        # Run CLARK for classification.
        cmd = 'python -m metagenomefilter.automateCLARK -s {} -d /mnt/nas2/databases/assemblydatabases/0.3.2/clark/ ' \
              '-C /home/ubuntu/Programs/CLARKSCV1.2.3.2/ {}\n'.format(work_dir, work_dir)
        os.system(cmd)

        # Get the output file uploaded.
        output_list = [{
            'path': os.path.join(work_dir, 'reports', 'abundance.xlsx'),
            'filename': 'abundance.xlsx'
        }]
        redmine_instance.issue.update(resource_id=issue.id,
                                      uploads=output_list,
                                      status_id=4,
                                      notes='AutoCLARK process complete!')

        # Clean up all FASTA/FASTQ files so we don't take up too much space.
        os.system(
            'rm {workdir}/*fasta {workdir}/*fastq*'.format(workdir=work_dir))
    except Exception as e:
        # Top-level boundary: log the failure to Sentry and let the user know something went wrong.
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Something went wrong! We log this automatically and will look into the '
                  'problem and get back to you with a fix soon.')
コード例 #17
0
def staramr_redmine(redmine_instance, issue, work_dir, description):
    """Run StarAMR on the requested assemblies, grouped by the genus that mash screen assigns to each.

    Description lines that are purely alphabetic are ignored; every other line is treated as a SEQID.
    Each assembly is screened with mash to determine its genus, StarAMR is run once per supported genus, and
    the zipped outputs are attached to the Redmine issue together with a note listing any SEQIDs that could
    not be located, could not be classified, or belong to an unsupported genus.

    :param redmine_instance: Path to a pickle file holding the Redmine connection object
    :param issue: Path to a pickle file holding the Redmine issue being processed
    :param work_dir: Directory in which the assemblies are linked and the outputs are written
    :param description: Path to a pickle file holding the lines of the issue description
    """
    # Unpickle Redmine objects, closing each pickle file as soon as it has been read.
    # NOTE(review): pickle.load can execute arbitrary code - these paths must only ever point at pickles
    # written by the automator itself.
    with open(redmine_instance, 'rb') as handle:
        redmine_instance = pickle.load(handle)
    with open(issue, 'rb') as handle:
        issue = pickle.load(handle)
    with open(description, 'rb') as handle:
        description = pickle.load(handle)
    # Parse description to get list of SeqIDs
    seqids = list()
    for line in description:
        item = line.upper()
        # Minimal check to make sure IDs provided somewhat resemble a valid sample ID
        if not item.isalpha():
            seqids.append(item)

    # Record the requested SEQIDs
    with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
        f.writelines(seqid + '\n' for seqid in seqids)
    # Drop FASTA files into workdir
    retrieve_nas_files(seqids=seqids,
                       outdir=work_dir,
                       filetype='fasta',
                       copyflag=False)
    # Create output directory
    output_dir = os.path.join(work_dir, 'output')
    make_path(output_dir)
    # Get all of the FASTA files
    fasta_list = sorted(glob.glob(os.path.join(work_dir, '*.fasta')))
    # Set the folder to store all the StarAMR outputs
    staramr_output_dir = os.path.join(work_dir, 'staramr_outputs')
    # Initialise dictionaries to store the pointfinder-formatted genus, and the mash-calculated organism for each strain
    genus_dict = dict()
    organism_dict = dict()
    # SEQIDs for which a FASTA file was actually retrieved
    retrieved_seqids = set()
    # Create lists to store missing and unprocessed seqids
    unprocessed_seqs = list()
    missing_seqs = list()
    mash_fails = list()
    # Dictionary to convert the mash-calculated genus to the pointfinder format
    pointfinder_org_dict = {
        'Campylobacter': 'campylobacter',
        'Escherichia': 'e.coli',
        'Shigella': 'e.coli',
        'Mycobacterium': 'tuberculosis',
        'Neisseria': 'gonorrhoeae',
        'Salmonella': 'salmonella'
    }

    # Run mash screen on each of the assemblies
    for item in fasta_list:
        seqid = os.path.splitext(os.path.basename(item))[0]
        retrieved_seqids.add(seqid)
        screen_file = os.path.join(output_dir,
                                   '{seqid}_screen.tab'.format(seqid=seqid))
        mash.screen('/mnt/nas2/databases/confindr/databases/refseq.msh',
                    item,
                    threads=8,
                    w='',
                    i='0.95',
                    output_file=screen_file,
                    returncmd=True)
        screen_output = mash.read_mash_screen(screen_file)
        # Determine the genus from the screen output file. If the screen reports several hits, the last one
        # in the file is the one that ends up being recorded.
        for screen in screen_output:
            # Extract the genus from the mash results
            mash_organism = screen.query_id.split('/')[-3]
            # Use the organism as a key in the pointfinder database name conversion dictionary
            mash_genus = pointfinder_org_dict.get(mash_organism, 'NA')
            # Populate the dictionaries with the seqid, and the calculated genus/pointfinder name
            genus_dict[seqid] = mash_genus
            organism_dict[seqid] = mash_organism
    # Delete all of the FASTA files
    for fasta in fasta_list:
        os.remove(fasta)

    # StarAMR
    # These unfortunate hard coded paths appear to be necessary
    activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/staramr'
    staramr_py = '/mnt/nas2/virtual_environments/staramr/bin/staramr'
    # List of organisms for which StarAMR is run
    staramr_list = ['campylobacter', 'salmonella']
    os.makedirs(staramr_output_dir, exist_ok=True)

    # Group the SEQIDs by genus, and sort the ones without a genus into "never retrieved" and "mash found nothing"
    genus_seqid_dict = dict()
    for seqid in sorted(seqids):
        if seqid in genus_dict:
            genus_seqid_dict.setdefault(genus_dict[seqid], list()).append(seqid)
        elif seqid in retrieved_seqids:
            # Mash sometimes doesn't find a genus!
            mash_fails.append(seqid)
        else:
            missing_seqs.append(seqid)

    for genus in genus_seqid_dict:
        if genus in staramr_list:
            assembly_folder = os.path.join(work_dir, genus)
            make_path(assembly_folder)
            retrieve_nas_files(seqids=genus_seqid_dict[genus],
                               outdir=assembly_folder,
                               filetype='fasta',
                               copyflag=False)
            fastas = sorted(glob.glob(os.path.join(assembly_folder, '*.fasta')))
            outdir = os.path.join(staramr_output_dir, genus)
            cmd = '{py} search --pointfinder-organism {orgn} -o {output} ' \
                .format(py=staramr_py,
                        orgn=genus,
                        output=outdir,
                        )
            cmd += ''.join(fasta + ' ' for fasta in fastas)
            # Create another shell script to execute within the staramr conda environment
            template = "#!/bin/bash\n{} && {}".format(activate, cmd)
            pointfinder_script = os.path.join(work_dir, 'run_staramr.sh')
            with open(pointfinder_script, 'w+') as f:
                f.write(template)
            # Modify the permissions of the script to allow it to be run on the node
            make_executable(pointfinder_script)
            # Run shell script
            os.system(pointfinder_script)
        else:
            unprocessed_seqs.extend(genus_seqid_dict[genus])

    # Zip output
    zip_filepath = zip_folder(results_path=staramr_output_dir,
                              output_dir=work_dir,
                              output_filename='staramr_output')
    zip_filepath += '.zip'
    # Prepare upload
    output_list = [{
        'filename': os.path.basename(zip_filepath),
        'path': zip_filepath
    }]

    # Create a note to add to the updated Redmine issue
    notes = 'StarAMR process complete!'
    # If there are missing, or unprocessed sequences, add details to the note
    if unprocessed_seqs:
        seq_list = ['{seqid} ({organism})'.format(seqid=sequence, organism=organism_dict[sequence])
                    for sequence in unprocessed_seqs]
        if len(unprocessed_seqs) > 1:
            notes += '\n The following sequences were not processed, as they were determined to be genera not ' \
                     'present in the StarAMR database: {seqs}'.format(seqs=', '.join(seq_list))
        else:
            notes += '\n The following sequence was not processed, as it was determined to be a genus not ' \
                     'present in the StarAMR database: {seqs}'.format(seqs=', '.join(seq_list))
    if missing_seqs:
        if len(missing_seqs) > 1:
            notes += '\n The following sequences were not processed, as they could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
        else:
            notes += '\n The following sequence was not processed, as it could not be located in the strain database:' \
                     ' {seqs}'.format(seqs=', '.join(missing_seqs))
    if mash_fails:
        if len(mash_fails) > 1:
            notes += '\n The following sequences could not be processed by MASH screen: {seqs}'\
                .format(seqs=', '.join(mash_fails))
        else:
            notes += '\n The following sequence could not be processed by MASH screen: {seqs}'\
                .format(seqs=', '.join(mash_fails))
    # Clean up the working directory by removing every folder in it (the zip file itself is kept)
    for folder in glob.glob(os.path.join(work_dir, '*/')):
        if os.path.isdir(folder):
            shutil.rmtree(folder)
    # Wrap up issue
    redmine_instance.issue.update(resource_id=issue.id,
                                  uploads=output_list,
                                  status_id=4,
                                  notes=notes)
コード例 #18
0
def confindr_redmine(redmine_instance, issue, work_dir, description):
    """
    Run ConFindr on the FASTQ files of the requested SEQIDs and upload the zipped results to the Redmine issue.

    :param redmine_instance: Path to a pickle file holding the Redmine API object
    :param issue: Path to a pickle file holding the Redmine issue object
    :param work_dir: Working directory; the raw_reads and output folders and the run script are created inside it
    :param description: Path to a pickle file holding the lines of the issue description
    """
    # Unpickle Redmine objects. The context managers close each file handle as soon as it has been read.
    # NOTE(review): pickle is only safe on trusted input - presumably these files are written by the automator.
    with open(redmine_instance, 'rb') as pickle_handle:
        redmine_instance = pickle.load(pickle_handle)
    with open(issue, 'rb') as pickle_handle:
        issue = pickle.load(pickle_handle)
    with open(description, 'rb') as pickle_handle:
        description = pickle.load(pickle_handle)

    # Parse description to get list of SeqIDs
    seqids = list()
    for item in description:
        item = item.upper()
        # Minimal check to make sure IDs provided somewhat resemble a valid sample ID: purely alphabetic entries
        # are dropped, everything else is kept
        if not item.isalpha():
            seqids.append(item)

    # Create folder to drop FASTQ files
    raw_reads_folder = os.path.join(work_dir, 'raw_reads')
    os.mkdir(raw_reads_folder)

    # Create output folder
    output_folder = os.path.join(work_dir, 'output')
    os.mkdir(output_folder)

    # Extract FASTQ files.
    retrieve_nas_files(seqids=seqids, outdir=raw_reads_folder, filetype='fastq', copyflag=False)

    # These unfortunate hard coded paths appear to be necessary
    activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/confindr'
    confindr_py = '/mnt/nas2/virtual_environments/confindr/bin/confindr.py'

    # Database locations
    confindr_db = '/mnt/nas2/databases/confindr/databases/'

    # Prepare command
    cmd = '{confindr_py} ' \
          '-i {raw_reads_folder} ' \
          '-o {output_folder} ' \
          '-d {confindr_db}'.format(confindr_py=confindr_py,
                                    raw_reads_folder=raw_reads_folder,
                                    output_folder=output_folder,
                                    confindr_db=confindr_db)

    # Create a shell script that activates the confindr conda environment and then runs the command
    template = "#!/bin/bash\n{} && {}".format(activate, cmd)
    confindr_script = os.path.join(work_dir, 'run_confindr.sh')
    with open(confindr_script, 'w') as script_handle:
        script_handle.write(template)
    make_executable(confindr_script)

    # Run shell script
    os.system(confindr_script)

    # Zip output. The '.zip' extension is appended to the path returned by zip_folder.
    output_filename = 'confindr_output'
    zip_filepath = zip_folder(results_path=output_folder,
                              output_dir=work_dir,
                              output_filename=output_filename)
    zip_filepath += '.zip'

    # Prepare upload
    output_list = [
        {
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }
    ]

    # Wrap up issue
    redmine_instance.issue.update(resource_id=issue.id, uploads=output_list, status_id=4,
                                  notes='ConFindr process complete!')
コード例 #19
0
def externalretrieve_redmine(redmine_instance, issue, work_dir, description):
    """
    Upload the FASTQ files of the requested SEQIDs to the NCBI FTP site so they can be used in an SRA submission.

    The first three lines of the description are read as the FTP username, the FTP password and the FTP folder;
    every remaining line is treated as a SEQID.

    :param redmine_instance: Path to a pickle file holding the Redmine API object
    :param issue: Path to a pickle file holding the Redmine issue object
    :param work_dir: Working directory the FASTQ files are copied into
    :param description: Path to a pickle file holding the lines of the issue description
    """
    print('SRA upload')
    # Unpickle Redmine objects. The context managers close each file handle as soon as it has been read.
    # NOTE(review): pickle is only safe on trusted input - presumably these files are written by the automator.
    with open(redmine_instance, 'rb') as pickle_handle:
        redmine_instance = pickle.load(pickle_handle)
    with open(issue, 'rb') as pickle_handle:
        issue = pickle.load(pickle_handle)
    with open(description, 'rb') as pickle_handle:
        description = pickle.load(pickle_handle)

    try:
        # Parse description: FTP credentials and folder first, then the SEQIDs we need to run on.
        ftp_user = description.pop(0).rstrip()
        ftp_pass = description.pop(0).rstrip()
        ftp_folder = description.pop(0).rstrip()
        fastq_list = [item.upper() for item in description]

        # Use NAStools to put FASTQ files into our working dir.
        retrieve_nas_files(seqids=fastq_list,
                           outdir=work_dir,
                           filetype='fastq',
                           copyflag=True)

        # Check that we got all the requested files.
        missing_fastqs = check_fastqs_present(fastq_list, work_dir)
        if len(missing_fastqs) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'WARNING: Could not find the following requested FASTQ SEQIDs on'
                ' the OLC NAS: {}'.format(missing_fastqs))

        # Rename files to _R1 _R2, without anything else
        renamed_fastqs = rename_files(fastq_dir=work_dir)

        # Now need to login to the FTP to upload the FASTQ files.
        try:
            ftp_connection = ftplib.FTP('ftp-private.ncbi.nlm.nih.gov',
                                        user=ftp_user,
                                        passwd=ftp_pass)
        except ftplib.error_perm:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes=
                'ERROR: Could not connect to the NCBI FTP site with the username '
                'and password provided. Make sure the username provided to you is the '
                'first line of the description, and the password is the second.'
            )
            return

        # The context manager closes the connection on every exit path: normal completion, the early return
        # below, and any exception raised while uploading.
        with ftp_connection:
            try:
                ftp_connection.cwd(ftp_folder)
            except ftplib.error_perm:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    status_id=4,
                    notes=
                    'ERROR: Could not find the directory you specified on the NCBI FTP '
                    'site. Make sure the directory is specified on the third line of the '
                    'description.')
                return
            # Upload into a sub-folder named after the issue
            ftp_connection.mkd(str(issue.id))
            ftp_connection.cwd(str(issue.id))
            for fastq in renamed_fastqs:
                with open(fastq, 'rb') as fastq_handle:
                    ftp_connection.storbinary('STOR {}'.format(os.path.basename(fastq)), fastq_handle)

        # Finally, do some file cleanup. This is best-effort: a failure here must not fail the whole request.
        try:
            os.system('rm {}/*.fastq.gz'.format(work_dir))
        except (OSError, ValueError):
            pass

        redmine_instance.issue.update(
            resource_id=issue.id,
            status_id=4,
            notes=
            'SRA Retrieve process complete! You should now be able to select the FTP '
            'folder called {} when prompted for your SRA submission.\n\n'.
            format(issue.id))
    except Exception as e:
        # Top-level boundary: report any failure (including a description with fewer than three lines) on the issue
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #20
0
def geneseekr_redmine(redmine_instance, issue, work_dir, description):
    """
    Run a GeneSeekr BLAST analysis on the requested SEQIDs and upload the zipped reports to the Redmine issue.

    Description lines containing one of the keywords ALIGN, BLAST, CUTOFF, EVALUE, UNIQUE, ORGANISM, ANALYSIS or
    FASTA set the corresponding option; every other line is treated as a SEQID.

    :param redmine_instance: Path to a pickle file holding the Redmine API object
    :param issue: Path to a pickle file holding the Redmine issue object
    :param work_dir: Working directory; FASTA files, custom targets, the run script and reports are placed in it
    :param description: Path to a pickle file holding the lines of the issue description
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects. The context managers close each file handle as soon as it has been read.
    # NOTE(review): pickle is only safe on trusted input - presumably these files are written by the automator.
    with open(redmine_instance, 'rb') as pickle_handle:
        redmine_instance = pickle.load(pickle_handle)
    with open(issue, 'rb') as pickle_handle:
        issue = pickle.load(pickle_handle)
    with open(description, 'rb') as pickle_handle:
        description = pickle.load(pickle_handle)
    # Current list of analysis types that the GeneSeekr can perform
    analyses = [
        'custom', 'gdcs', 'genesippr', 'mlst', 'resfinder', 'rmlst',
        'serosippr', 'sixteens', 'virulence'
    ]
    # Current BLAST analyses supported
    blasts = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    # Values of UNIQUE=<value> that leave the unique flag switched off
    unique_off_values = ('', 'false', 'f', 'no', 'n', '0')
    # Variable to hold supplied arguments
    argument_dict = {
        'analysis': str(),
        'align': False,
        'blast': 'blastn',
        'cutoff': 70,
        'evalue': '1E-5',
        'unique': False,
        'organism': str(),
        'fasta': False,
    }
    # Dictionary of analysis types to argument flags to pass to the script
    argument_flags = {
        'custom': '',
        'gdcs': '-Q',
        'genesippr': '-G',
        'mlst': '-M',
        'resfinder': '-A',
        'rmlst': '-R',
        'serosippr': '-S',
        'sixteens': '-X',
        'virulence': '-V'
    }
    # Set the database path for the analyses
    dbpath = COWBAT_DATABASES
    database_path = {
        'custom': os.path.join(work_dir, 'targets'),
        'gdcs': os.path.join(dbpath, 'GDCS'),
        'genesippr': os.path.join(dbpath, 'genesippr'),
        'mlst': os.path.join(dbpath, 'MLST'),
        'resfinder': os.path.join(dbpath, 'resfinder'),
        'rmlst': os.path.join(dbpath, 'rMLST'),
        'serosippr': os.path.join(dbpath, 'serosippr', 'Escherichia'),
        'sixteens': os.path.join(dbpath, 'sixteens_full'),
        'virulence': os.path.join(dbpath, 'virulence'),
    }
    try:
        # Parse description to figure out the options and what SEQIDs we need to run on.
        # Option values are stripped so that whitespace around the '=' (e.g. 'analysis = mlst') is tolerated.
        seqids = list()
        for item in description:
            item = item.upper().rstrip()
            if 'ALIGN' in item:
                argument_dict['align'] = True
                continue
            if 'BLAST' in item:
                argument_dict['blast'] = item.split('=')[1].strip().lower()
                continue
            if 'CUTOFF' in item:
                argument_dict['cutoff'] = int(item.split('=')[1].strip())
                continue
            if 'EVALUE' in item:
                argument_dict['evalue'] = item.split('=')[1].strip().lower()
                continue
            if 'UNIQUE' in item:
                # Store a real boolean: a raw string such as 'false' is truthy and would switch the flag on.
                # A bare UNIQUE keyword enables the flag, just like ALIGN and FASTA.
                _, separator, value = item.partition('=')
                argument_dict['unique'] = not separator or value.strip().lower() not in unique_off_values
                continue
            if 'ORGANISM' in item:
                argument_dict['organism'] = item.split('=')[1].strip().capitalize()
                # Without this continue the ORGANISM line falls through and is recorded as a SEQID
                continue
            if 'ANALYSIS' in item:
                argument_dict['analysis'] = item.split('=')[1].strip().lower()
                continue
            if 'FASTA' in item:
                argument_dict['fasta'] = True
                continue
            # Otherwise the item should be a SEQID
            seqids.append(item)
        if argument_dict['analysis'] == 'custom':
            # Set and create the directory to store the custom targets
            target_dir = os.path.join(work_dir, 'targets')
            try:
                os.mkdir(target_dir)
            except FileExistsError:
                pass
            # Download the attached FASTA file.
            # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
            # out a better way to do it. If several files are attached, the last one listed is used.
            issue_with_attachments = redmine_instance.issue.get(issue.id,
                                                                include='attachments')
            attachment_id = 0
            for attached_file in issue_with_attachments.attachments:
                attachment_id = attached_file.id
            # An attachment id of 0 indicates that we didn't find anything attached to the issue.
            if attachment_id == 0:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes=
                    'ERROR: Analysis type custom requires an attached FASTA file of '
                    'targets. The automator could not find any attached files. '
                    'Please create a new issue with the FASTA file attached and try '
                    'again.',
                    status_id=4)
                return
            attachment = redmine_instance.attachment.get(attachment_id)
            attachment.download(savepath=target_dir,
                                filename='targets.tfa')
        # Ensure that the organism has been provided for organism-specific analyses
        if argument_dict['analysis'] in ('gdcs', 'mlst'):
            if not argument_dict['organism']:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes=
                    'ERROR: Analysis type {at} requires the genus to be used for the '
                    'analyses. Please create a new issue with organism=ORGANISM '
                    'included in the issue.'.format(
                        at=argument_dict['analysis']),
                    status_id=4)
                return
            # The organism-specific database lives in a sub-folder named after the genus
            database_path[argument_dict['analysis']] = os.path.join(
                database_path[argument_dict['analysis']],
                argument_dict['organism'])
        # Ensure that the analysis type is provided
        if not argument_dict['analysis']:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not identify an analysis type. '
                'Please ensure that the first line of the issue contains one'
                ' of the following keywords: {ats}'.format(
                    ats=', '.join(analyses)),
                status_id=4)
            return
        if argument_dict['analysis'] not in analyses:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'WARNING: supplied analysis type {at} current not in the supported '
                'list of analyses: {ats}'.format(at=argument_dict['analysis'],
                                                 ats=', '.join(analyses)),
                status_id=4)
            return
        # Ensure that the requested BLAST analysis is valid
        if argument_dict['blast'] not in blasts:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'WARNING: requested BLAST analysis, {bt}, is not one of the currently '
                'supported analyses: {blasts}'.format(
                    bt=argument_dict['blast'], blasts=', '.join(blasts)),
                status_id=4)
            return
        # Ensure that SEQIDs were included
        if not seqids:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: No SEQIDs provided!',
                                          status_id=4)
            return
        # Run file linker and then make sure that all FASTA files requested are present. Warn user if they
        # requested things that we don't have.
        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype='fasta',
                           copyflag=False)
        missing_fastas = verify_fasta_files_present(seqids, work_dir)
        # Update the Redmine issue if one or more of the requested SEQIDs could not be located
        if missing_fastas:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                ' the OLC NAS: {}'.format(missing_fastas))

        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/geneseekr'
        seekr_py = '/mnt/nas2/virtual_environments/geneseekr/bin/GeneSeekr'
        # Build the GeneSeekr command with the necessary arguments
        seekr_cmd = 'python {seekr_py} {blast} -s {seqpath} -r {outpath} -t {dbpath} -c {cutoff} -e {evalue} {atf}'\
            .format(seekr_py=seekr_py,
                    blast=argument_dict['blast'],
                    seqpath=work_dir,
                    outpath=os.path.join(work_dir, 'reports'),
                    dbpath=database_path[argument_dict['analysis']],
                    cutoff=argument_dict['cutoff'],
                    evalue=argument_dict['evalue'],
                    atf=argument_flags[argument_dict['analysis']])
        # Append the align, unique and/or fasta flags if they were requested
        if argument_dict['align']:
            seekr_cmd += ' -a'
        if argument_dict['unique']:
            seekr_cmd += ' -u'
        if argument_dict['fasta']:
            seekr_cmd += ' -f'
        # Update the issue with the GeneSeekr command
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='GeneSeekr command:\n {cmd}'.format(cmd=seekr_cmd))
        # Create a shell script that activates the geneseekr conda environment and then runs the command
        template = "#!/bin/bash\n{} && {}".format(activate, seekr_cmd)
        geneseekr_script = os.path.join(work_dir, 'run_geneseekr.sh')
        with open(geneseekr_script, 'w') as script_handle:
            script_handle.write(template)
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(geneseekr_script)
        # Run shell script
        os.system(geneseekr_script)

        # Zip output. The '.zip' extension is appended to the path returned by zip_folder.
        output_filename = 'geneseekr_output'
        zip_filepath = zip_folder(results_path=os.path.join(work_dir, 'reports'),
                                  output_dir=work_dir,
                                  output_filename=output_filename)
        zip_filepath += '.zip'
        # Prepare upload
        output_list = [{
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }]

        # Create a list of all the folders - will be used to clean up the working directory
        folders = glob.glob(os.path.join(work_dir, '*/'))
        # Remove all the folders
        for folder in folders:
            if os.path.isdir(folder):
                shutil.rmtree(folder)
        # Wrap up issue
        redmine_instance.issue.update(
            resource_id=issue.id,
            uploads=output_list,
            status_id=4,
            notes='{at} analysis with GeneSeekr complete!'.format(
                at=argument_dict['analysis'].lower()))
    except Exception as e:
        # Top-level boundary: log the failure to Sentry and let the requester know something went wrong
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! We log this automatically and will look into the '
            'problem and get back to you with a fix soon.')
コード例 #21
0
def psortb_redmine(redmine_instance, issue, work_dir, description):
    """
    Annotate the requested assemblies with prokka, run PsortB on the predicted proteins and upload the reports.

    Each assembly is classified as Gram negative or Gram positive by BLASTing Omp85 proteins against its predicted
    proteins, and PsortB is run with the matching flag. The zipped reports are uploaded via upload_to_ftp and the
    download location is posted to the Redmine issue.

    :param redmine_instance: Path to a pickle file holding the Redmine API object
    :param issue: Path to a pickle file holding the Redmine issue object
    :param work_dir: Working directory; the assemblies, prokka and report folders are created inside it
    :param description: Path to a pickle file holding the lines of the issue description (one SEQID per line)
    """
    try:
        # Unpickle Redmine objects. The context managers close each file handle as soon as it has been read.
        # NOTE(review): pickle is only safe on trusted input - presumably these files are written by the automator.
        with open(redmine_instance, 'rb') as pickle_handle:
            redmine_instance = pickle.load(pickle_handle)
        with open(issue, 'rb') as pickle_handle:
            issue = pickle.load(pickle_handle)
        with open(description, 'rb') as pickle_handle:
            description = pickle.load(pickle_handle)
        # Parse description to get list of SeqIDs
        seqids = [line.rstrip().upper() for line in description]

        assemblies_folder = os.path.join(work_dir, 'assemblies')
        retrieve_nas_files(seqids=seqids,
                           outdir=assemblies_folder,
                           filetype='fasta',
                           copyflag=False)
        missing_fastas = check_fastas_present(seqids, assemblies_folder)
        if len(missing_fastas) > 0:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: Could not find the following requested FASTA SEQIDs on'
                                                ' the OLC NAS: {}'.format(missing_fastas))
        # Drop every occurrence of a missing SEQID. list.remove() only removes the first occurrence, so a SEQID
        # requested twice would otherwise survive and break the PsortB step below.
        seqids = [seqid for seqid in seqids if seqid not in missing_fastas]
        # Steps to follow here:
        # 1) Generate protein file for sequence(s) of interest - run prokka so proteins get named nicely.

        prokka_folder = os.path.join(work_dir, 'prokka')
        os.makedirs(prokka_folder)
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/prokka'
        prokka = '/mnt/nas2/virtual_environments/prokka/bin/prokka'

        for assembly in glob.glob(os.path.join(assemblies_folder, '*.fasta')):
            seqid = os.path.split(assembly)[1].split('.')[0]
            # Prepare command
            cmd = '{prokka} --outdir {output_folder} --prefix {seqid} {assembly}'.format(
                prokka=prokka,
                output_folder=os.path.join(prokka_folder, seqid),
                seqid=seqid,
                assembly=assembly)

            # Create a shell script that activates the prokka conda environment and then runs the command.
            # The same script file is overwritten for every assembly.
            template = "#!/bin/bash\n{} && {}".format(activate, cmd)
            prokka_script = os.path.join(work_dir, 'run_prokka.sh')
            with open(prokka_script, 'w') as script_handle:
                script_handle.write(template)
            make_executable(prokka_script)

            # Run shell script
            os.system(prokka_script)
        # 2) Figure out for each sequence if gram positive or negative
        # Psort says using Omp85 works pretty well for determining. Use Omp85 proteins from Neisseria, Thermosipho,
        # Synechoccus, and Thermus. If any hit has an e value below 10^-4 the genome is called gram negative,
        # otherwise gram positive. There are exceptions to this, but I don't think any of them are organisms that
        # we work with/care about
        omp85_proteins = '/mnt/nas2/redmine/applications/OLCRedmineAutomator/data_and_stuff/omp85_proteins.fasta'
        gram_pos_neg_dict = dict()
        protein_files = glob.glob(os.path.join(prokka_folder, '*', '*.faa'))
        for protein_file in protein_files:
            seqid = os.path.split(protein_file)[1].replace('.faa', '')
            # Make a blast DB from proteins.
            cmd = 'makeblastdb -in {} -dbtype prot'.format(protein_file)
            os.system(cmd)
            # Now BLAST our OMP85 proteins against the genome proteins.
            blast_result_file = protein_file.replace('.faa', '_blast.tsv')
            cmd = 'blastp -db {} -query {} -out {} -outfmt "6 qseqid sseqid evalue"'.format(protein_file,
                                                                                            omp85_proteins,
                                                                                            blast_result_file)
            os.system(cmd)
            # Now parse through blast report to find if gram positive or negative. The e value is the last column.
            with open(blast_result_file) as blast_results:
                has_omp_85 = any(float(line.rstrip().split()[-1]) < 0.0001 for line in blast_results)

            gram_pos_neg_dict[seqid] = 'Negative' if has_omp_85 else 'Positive'

        # IMPORTANT NOTES ON GETTING PSORTB TO RUN:
        # You'll need to
        # 1) have pulled a docker image of PSORTB to each of the
        # nodes (docker pull brinkmanlab/psortb_commandline:1.0.2 should do the trick)
        # 2) Put a psortb executable into the data_and_stuff folder:
        # wget -O data_and_stuff/psortb
        #   https://raw.githubusercontent.com/brinkmanlab/psortb_commandline_docker/master/psortb
        # 3) chmod the psortb executable to actually make it executable.
        # 4) remove the 'sudo' from lines 35 and 61 of the psortb executable, otherwise nodes get unhappy, and make
        # the -it in the commands into just a -i

        # 3) Run PsortB!
        psortb_executable = '/mnt/nas2/redmine/applications/OLCRedmineAutomator/data_and_stuff/psortb'
        for seqid in seqids:
            protein_file = os.path.join(prokka_folder, seqid, seqid + '.faa')
            output_dir = os.path.join(prokka_folder, seqid)
            cmd = '{} -i {} -r {} '.format(psortb_executable, protein_file, output_dir)
            # A SEQID for which prokka produced no protein file has no entry in the dictionary; the resulting
            # KeyError is reported on the issue by the handler at the bottom of this function.
            if gram_pos_neg_dict[seqid] == 'Negative':
                cmd += '--negative'
            else:
                cmd += '--positive'
            os.system(cmd)

        # Now need to: upload results, do file cleanup.
        report_dir = os.path.join(work_dir, 'psortb_reports_{}'.format(issue.id))
        os.makedirs(report_dir)
        # Prefix each PsortB report with the name of its SEQID folder so the names stay unique in the report folder
        for raw_report in glob.glob(os.path.join(prokka_folder, '*', '*psortb*.txt')):
            new_name = os.path.basename(os.path.dirname(raw_report)) + '_' + os.path.basename(raw_report)
            shutil.copy(raw_report, os.path.join(report_dir, new_name))
        # Include the protein files alongside the reports
        for faa_file in glob.glob(os.path.join(prokka_folder, '*', '*.faa')):
            shutil.copy(faa_file, report_dir)

        shutil.make_archive(report_dir, 'zip', report_dir)
        # NOTE(review): the return value of upload_to_ftp is not checked - presumably it signals whether the
        # upload succeeded; confirm and report a failed upload instead of claiming completion.
        upload_to_ftp(local_file=report_dir + '.zip')
        redmine_instance.issue.update(resource_id=issue.id, status_id=4,
                                      notes='PsortB complete! Results available at: '
                                            'ftp://ftp.agr.gc.ca/outgoing/cfia-ak/{}'.format(os.path.split(report_dir)[1] + '.zip'))
        shutil.rmtree(assemblies_folder)
        shutil.rmtree(prokka_folder)
    except Exception as e:
        # Top-level boundary: report any failure on the issue
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! Send this error traceback to your friendly '
                                            'neighborhood bioinformatician: {}'.format(e))
コード例 #22
0
def resfinder_redmine(redmine_instance, issue, work_dir, description):
    """
    Run ResFinder (GeneSeekr blastn) and MOB Recon on the requested SEQIDs and upload the reports to the issue.

    :param redmine_instance: Path to a pickle file holding the Redmine API object
    :param issue: Path to a pickle file holding the Redmine issue object
    :param work_dir: Working directory; FASTA files are linked into it and reports are written to its reports folder
    :param description: Path to a pickle file holding the lines of the issue description (one SEQID per line)
    """
    # Unpickle Redmine objects. The context managers close each file handle as soon as it has been read.
    # NOTE(review): pickle is only safe on trusted input - presumably these files are written by the automator.
    with open(redmine_instance, 'rb') as pickle_handle:
        redmine_instance = pickle.load(pickle_handle)
    with open(issue, 'rb') as pickle_handle:
        issue = pickle.load(pickle_handle)
    with open(description, 'rb') as pickle_handle:
        description = pickle.load(pickle_handle)

    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = [item.upper() for item in description]

        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype='fasta',
                           copyflag=False)

        missing_fastas = verify_fasta_files_present(seqids, work_dir)
        if missing_fastas:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                ' the OLC NAS: {}'.format(missing_fastas))

        # Use the COWBAT_DATABASES variable as the database path
        db_path = COWBAT_DATABASES
        reports_folder = os.path.join(work_dir, 'reports')
        # Run ResFindr
        cmd = 'GeneSeekr blastn -s {seqfolder} -t {targetfolder} -r {reportdir} -A'\
            .format(seqfolder=work_dir,
                    targetfolder=os.path.join(db_path, 'resfinder'),
                    reportdir=reports_folder)
        # Update the issue with the ResFinder command
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='ResFinder command:\n {cmd}'.format(cmd=cmd))
        os.system(cmd)
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/cowbat'
        # Build the MOB Recon command with the necessary arguments
        mob_cmd = 'python -m spadespipeline.mobrecon -s {seqfolder} -r {targetfolder}' \
            .format(seqfolder=work_dir,
                    targetfolder=os.path.join(db_path, 'mobrecon'))
        # Update the issue with the MOB Recon command
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='MOB Recon command:\n {cmd}'.format(cmd=mob_cmd))
        # Create a shell script that activates the cowbat conda environment and then runs MOB Recon
        template = "#!/bin/bash\n{} && {}".format(activate, mob_cmd)
        mob_script = os.path.join(work_dir, 'run_mob_recon.sh')
        with open(mob_script, 'w') as script_handle:
            script_handle.write(template)
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(mob_script)
        # Run shell script
        os.system(mob_script)
        # Get the output files uploaded: the three reports are attached separately
        report_names = ('resfinder_blastn.xlsx', 'mob_recon_summary.csv', 'amr_summary.csv')
        output_list = [
            {
                'path': os.path.join(reports_folder, report_name),
                'filename': report_name
            }
            for report_name in report_names
        ]
        redmine_instance.issue.update(resource_id=issue.id,
                                      uploads=output_list,
                                      status_id=4,
                                      notes='resfinder process complete!')

        # Clean up all FASTA/FASTQ files so we don't take up too much space on the NAS
        os.system('rm {workdir}/*fasta'.format(workdir=work_dir))
        try:
            # Remove all other folders. Only the top level of the working directory needs to be listed, since
            # rmtree removes everything beneath each folder; this avoids deleting directories out from under an
            # os.walk that is still iterating over them.
            for entry in os.listdir(work_dir):
                entry_path = os.path.join(work_dir, entry)
                if os.path.isdir(entry_path):
                    shutil.rmtree(entry_path)
        except OSError:
            # Cleanup is best-effort: the results have already been uploaded
            pass
    except Exception as e:
        # Top-level boundary: report any failure on the issue
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #23
0
def cowsnphr_redmine(redmine_instance, issue, work_dir, description):
    """
    Annotate a reference genome with Prokka, run COWSNPhR on the query sequences against it, and report the
    outcome on the Redmine issue.

    Description entries that appear before an entry containing 'COMPARE' are reference candidates (entries
    containing 'REFERENCE' are ignored); entries after it are query SEQIDs. Exactly one reference must be
    supplied: either a SEQID, or 'ATTACHED' to use a file attached to the issue. The archive returned by
    zip_folder is uploaded with upload_to_ftp, and the issue is updated with the result.

    :param redmine_instance: Path to the pickled Redmine object used to read and update issues
    :param issue: Path to the pickled Redmine issue object
    :param work_dir: Working directory in which the 'ref' and 'fastqs' folders and helper scripts are created
    :param description: Path to the pickled iterable of description entries (strings)
    """
    try:
        # Unpickle Redmine objects, closing each file handle as soon as it has been read
        # NOTE(review): pickle.load can execute arbitrary code; presumably these files are written by the
        # automator itself - confirm they are never user-supplied
        with open(redmine_instance, 'rb') as pickled_file:
            redmine_instance = pickle.load(pickled_file)
        with open(issue, 'rb') as pickled_file:
            issue = pickle.load(pickled_file)
        with open(description, 'rb') as pickled_file:
            description = pickle.load(pickled_file)
        query_list = list()
        reference = list()
        compare = False
        # Go through description to figure out what our query is and what the reference is.
        for item in description:
            item = item.upper()
            if item == '':
                continue
            if 'COMPARE' in item:
                # Everything after this marker is a query SEQID
                compare = True
                continue
            if compare:
                query_list.append(item)
            elif 'REFERENCE' not in item:
                reference.append(item)
        # Create output folder
        reference_folder = os.path.join(work_dir, 'ref')
        os.makedirs(reference_folder)
        # Retrieve our reference file. Error user if they selected anything but one reference and don't continue.
        if len(reference) != 1:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='ERROR: You must specify one reference strain, and you '
                'specified {} reference strains. Please create a new'
                ' issue and try again.'.format(len(reference)),
                status_id=4)
            return

        # The entries were upper-cased above, so a direct comparison is sufficient
        if reference[0] != 'ATTACHED':
            # Extract our reference file to our working directory.
            retrieve_nas_files(seqids=reference,
                               outdir=reference_folder,
                               filetype='fasta',
                               copyflag=True)
            # Check that the file was successfully extracted. If it wasn't boot the user.
            if not glob.glob(os.path.join(reference_folder, '*fasta')):
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes='ERROR: Could not find the specified reference file.'
                    ' Please verify it is a correct SEQID, create a new '
                    'issue, and try again.',
                    status_id=4)
                return

        # If user specified attachment as the reference file, download it to our working directory.
        else:
            # Get the attachment ID, and download if it isn't equal to zero (meaning no attachment, so boot user with
            # appropriate error message)
            attachment = redmine_instance.issue.get(issue.id,
                                                    include='attachments')
            attachment_id = 0
            ref_name = 'reference.fasta'
            # When several files are attached, the last one listed is used
            for item in attachment.attachments:
                attachment_id = item.id
                ref_name = item.filename
            # Download if we found an attachment, and use as our reference. Otherwise, exit and tell user to try again
            if attachment_id != 0:
                attachment = redmine_instance.attachment.get(attachment_id)
                attachment.download(savepath=reference_folder,
                                    filename=ref_name)
            else:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes=
                    'ERROR: You specified that the reference would be in attached file,'
                    ' but no attached file was found. Please create a new issue and '
                    'try again.',
                    status_id=4)
                return

        # Locate the reference once, before Prokka writes its own output into the same folder. An attached file
        # keeps its original name, so it may not match the '*fasta' pattern; report that explicitly rather than
        # failing with an IndexError.
        reference_files = glob.glob(os.path.join(reference_folder, '*fasta'))
        if not reference_files:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='ERROR: Could not find a FASTA reference file to use. Please ensure '
                'that the name of the reference file ends with "fasta", create a new '
                'issue, and try again.',
                status_id=4)
            return
        ref_file = reference_files[0]

        # PROKKA
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/prokka'
        prokka = '/mnt/nas2/virtual_environments/prokka/bin/prokka'

        # Prepare command
        prefix = os.path.splitext(os.path.basename(ref_file))[0]
        cmd = '{prokka} --force --outdir {output_folder} --prefix {prefix} {ref_file}'\
            .format(prokka=prokka,
                    output_folder=reference_folder,
                    prefix=prefix,
                    ref_file=ref_file)
        # Update the issue with the Prokka command
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Prokka command:\n {cmd}'.format(cmd=cmd))
        # Create a shell script that activates the Prokka conda environment before running the command
        template = "#!/bin/bash\n{} && {}".format(activate, cmd)
        prokka_script = os.path.join(work_dir, 'run_prokka.sh')
        with open(prokka_script, 'w') as script_file:
            script_file.write(template)
        make_executable(prokka_script)

        # Run shell script
        os.system(prokka_script)

        seq_folder = os.path.join(work_dir, 'fastqs')
        # Now extract our query files.
        retrieve_nas_files(seqids=query_list,
                           outdir=seq_folder,
                           filetype='fastq',
                           copyflag=False)

        # With our query files extracted, verify that all the SEQIDs the user specified were able to be found.
        missing_fastqs = verify_fastqs_present(query_list, seq_folder)
        if len(missing_fastqs) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'Warning! Could not find the following requested query SEQIDs: '
                '{}. \nYou may want to verify the SEQIDs, create a new issue, and try'
                ' again.'.format(str(missing_fastqs)))

        # Now check that the FASTQ files aren't too far away from the specified reference file.
        bad_fastqs = check_distances(ref_fasta=ref_file,
                                     fastq_folder=seq_folder,
                                     work_dir=work_dir)
        if len(bad_fastqs) > 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='Warning! The following SEQIDs were found to be fairly'
                ' divergent from the reference file specified:{} \nYou may'
                ' want to start a new COWSNPhR issue without them and try '
                'again.'.format(str(bad_fastqs)))

        # COWSNPhR
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/vsnp_dev'
        binary = '/mnt/nas2/virtual_environments/vSNP/cowsnphr/cowsnphr.py'

        # Prepare command
        cmd = '{bin} -s {seq_folder} -r {ref_folder} -w /mnt/nas2' \
            .format(bin=binary,
                    seq_folder=seq_folder,
                    ref_folder=reference_folder)
        # Update the issue with the COWSNPhR command
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='COWSNPhR command:\n {cmd}'.format(cmd=cmd))
        # Create a shell script that activates the vsnp_dev conda environment before running the command
        template = "#!/bin/bash\n{} && {}".format(activate, cmd)
        cowsnphr_script = os.path.join(work_dir, 'run_cowsnphr.sh')
        with open(cowsnphr_script, 'w') as script_file:
            script_file.write(template)
        make_executable(cowsnphr_script)

        # Run shell script
        os.system(cowsnphr_script)

        # Zip output; the '.zip' extension is appended to the path returned by zip_folder
        output_filename = 'cowsnphr_output_{}'.format(issue.id)
        zip_filepath = zip_folder(output_dir=work_dir,
                                  output_filename=output_filename)
        zip_filepath += '.zip'
        # The archive is shared over FTP rather than attached to the issue
        upload_successful = upload_to_ftp(local_file=zip_filepath)
        # Report the outcome of the upload
        if upload_successful:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes='COWSNPhr process complete!\n\n'
                'Results are available at the following FTP address:\n'
                'ftp://ftp.agr.gc.ca/outgoing/cfia-ak/{}'.format(
                    os.path.split(zip_filepath)[1]))
        else:
            redmine_instance.issue.update(
                resource_id=issue.id,
                status_id=4,
                notes=
                'Upload of result files was unsuccessful due to FTP connectivity '
                'issues. Please try again later.')
        # Clean up files
        shutil.rmtree(reference_folder)
        shutil.rmtree(seq_folder)
    except Exception as e:
        # Top-level boundary: report any failure on the issue instead of crashing the automator
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #24
0
def intimin_typer_redmine(redmine_instance, issue, work_dir, description):
    """
    Run phylotyper intimin (eae) subtyping on the requested SEQIDs and attach the predictions to the issue.

    :param redmine_instance: Path to the pickled Redmine object used to update the issue
    :param issue: Path to the pickled Redmine issue object
    :param work_dir: Working directory in which the FASTA folder, output folder, and helper script are created
    :param description: Path to the pickled iterable of description entries (strings)
    """
    # Unpickle Redmine objects, closing each file handle as soon as it has been read
    # NOTE(review): pickle.load can execute arbitrary code; presumably these files are written by the
    # automator itself - confirm they are never user-supplied
    with open(redmine_instance, 'rb') as pickled_file:
        redmine_instance = pickle.load(pickled_file)
    with open(issue, 'rb') as pickled_file:
        issue = pickle.load(pickled_file)
    with open(description, 'rb') as pickled_file:
        description = pickle.load(pickled_file)

    try:
        # Parse description to get list of SeqIDs
        seqids = list()
        for item in description:
            item = item.upper()
            # Minimal check to make sure IDs provided somewhat resemble a valid sample ID: purely alphabetic
            # entries are discarded
            # NOTE(review): an empty entry is not alphabetic, so it is kept as a SEQID - confirm this is intended
            if not item.isalpha():
                seqids.append(item)

        # Folder that receives the FASTA files
        fasta_folder = os.path.join(work_dir, 'fasta_files')

        # Extract FASTA files.
        retrieve_nas_files(seqids=seqids, outdir=fasta_folder, filetype='fasta', copyflag=False)

        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/phylotyper'
        phylotyper_py = '/mnt/nas2/virtual_environments/phylotyper/bin/phylotyper'

        output_dir = os.path.join(work_dir, 'intimin_subtype_output')
        # Prepare command: every FASTA file is appended as a positional argument, each followed by a space
        fasta_files = sorted(glob.glob(os.path.join(fasta_folder, '*.fasta')))
        cmd = '{phylotyper_py} genome eae {output_dir} '.format(phylotyper_py=phylotyper_py, output_dir=output_dir)
        cmd += ''.join(fasta_file + ' ' for fasta_file in fasta_files)

        # Create a shell script that activates the phylotyper conda environment before running the command
        template = "#!/bin/bash\n{} && {}".format(activate, cmd)
        phylotyper_script = os.path.join(work_dir, 'run_phylotyper.sh')
        with open(phylotyper_script, 'w') as script_file:
            script_file.write(template)
        make_executable(phylotyper_script)

        # Run shell script
        os.system(phylotyper_script)

        # Clean up fairly large html files that phylotyper makes so we don't come anywhere near redmine upload size
        # limit.
        os.system('rm {}'.format(os.path.join(output_dir, '*.html')))

        # Prepare upload: the predictions are attached under a more descriptive file name
        output_list = [
            {
                'filename': 'intimin_predictions.tsv',
                'path': os.path.join(output_dir, 'subtype_predictions.tsv')
            }
        ]

        # Wrap up issue
        redmine_instance.issue.update(resource_id=issue.id, uploads=output_list, status_id=4,
                                      notes='Intimin subtyping complete!')
    except Exception as e:
        # Include the error in the note so the failure can be diagnosed, as the other automators do
        redmine_instance.issue.update(resource_id=issue.id, status_id=4,
                                      notes='Something went wrong, fix me! Error: {}'.format(e))
コード例 #25
0
def primer_finder_redmine(redmine_instance, issue, work_dir, description):
    """
    Run the legacy or supremacy (bbduk) primer finder on the requested SEQIDs and attach the zipped results.

    Description entries containing one of the keywords PROGRAM, ANALYSIS, MISMATCHES, KMERSIZE, FORMAT or
    EXPORTAMPLICONS are parsed as options (the value is taken from after the '='); every other entry is
    treated as a SEQID. Invalid or missing required options are reported on the issue and the run is aborted.

    :param redmine_instance: Path to the pickled Redmine object used to read and update issues
    :param issue: Path to the pickled Redmine issue object
    :param work_dir: Working directory that receives the sequence files, primer file, script, and outputs
    :param description: Path to the pickled iterable of description entries (strings)
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects, closing each file handle as soon as it has been read
    # NOTE(review): pickle.load can execute arbitrary code; presumably these files are written by the
    # automator itself - confirm they are never user-supplied
    with open(redmine_instance, 'rb') as pickled_file:
        redmine_instance = pickle.load(pickled_file)
    with open(issue, 'rb') as pickled_file:
        issue = pickle.load(pickled_file)
    with open(description, 'rb') as pickled_file:
        description = pickle.load(pickled_file)
    # Programs supported by the automator
    programs = ['legacy', 'supremacy']
    # Current list of analysis types that the primer finder can support
    analyses = ['vtyper', 'custom']
    # List of supported file formats
    formats = ['fasta', 'fastq']
    # Acceptable number of mismatches
    mismatches = [0, 1, 2, 3]
    # Variable to hold supplied arguments
    argument_dict = {
        'program': str(),
        'analysis': str(),
        'mismatches': 2,
        'kmersize': '55,77,99,127',
        'format': 'fasta',
        'exportamplicons': False,
    }
    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = list()
        for item in description:
            item = item.upper().rstrip()
            if 'PROGRAM' in item:
                argument_dict['program'] = item.split('=')[1].lower()
                continue
            if 'ANALYSIS' in item:
                argument_dict['analysis'] = item.split('=')[1].lower()
                continue
            if 'MISMATCHES' in item:
                argument_dict['mismatches'] = int(item.split('=')[1].lower())
                continue
            if 'KMERSIZE' in item:
                argument_dict['kmersize'] = item.split('=')[1].lower()
                continue
            if 'FORMAT' in item:
                argument_dict['format'] = item.split('=')[1].lower()
                continue
            if 'EXPORTAMPLICONS' in item:
                argument_dict['exportamplicons'] = True
                continue
            # Otherwise the item should be a SEQID
            seqids.append(item)
        # Ensure that the program type is provided
        if not argument_dict['program']:
            redmine_instance.issue \
                .update(resource_id=issue.id,
                        notes='WARNING: No program type provided. Please ensure that issue contains '
                              '"program=requested_program", where requested_program is one of the '
                              'following keywords: {pts}. Please see the the usage guide: '
                              'https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(pts=','.join(programs)))
            return
        # Ensure that the supplied program type is valid
        if argument_dict['program'] not in programs:
            redmine_instance.issue \
                .update(resource_id=issue.id,
                        notes='WARNING: Requested program type: {pt} not in list of supported analyses: {pts}. Please '
                              'see https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(pt=argument_dict['program'],
                                                              pts=','.join(programs)))
            return
        # Custom analyses must have an attached FASTA file of primer sequences
        if argument_dict['analysis'] == 'custom':
            # Download the attached FASTA file.
            # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
            # out a better way to do it.
            attachment = redmine_instance.issue.get(issue.id,
                                                    include='attachments')
            attachment_id = 0
            # When several files are attached, the last one listed is used
            for item in attachment.attachments:
                attachment_id = item.id
            # Set the name of the file in which the targets are stored
            target_file = os.path.join(work_dir, 'primers.txt')
            # Download if attachment id is not 0, which indicates that we didn't find anything attached to the issue.
            if attachment_id != 0:
                attachment = redmine_instance.attachment.get(attachment_id)
                attachment.download(savepath=work_dir, filename='primers.txt')
            else:
                redmine_instance.issue.update(
                    resource_id=issue.id,
                    notes=
                    'ERROR: Analysis type custom requires an attached FASTA file of '
                    'targets. The automator could not find any attached files. '
                    'Please create a new issue with the FASTA file attached and try '
                    'again.',
                    status_id=4)
                return
        # Use the v-typer primer set included in the package for V-typer analyses
        else:
            target_file = \
                '/mnt/nas2/virtual_environments/in_silico_pcr/lib/python3.6/site-packages/spadespipeline/primers.txt'
        # Ensure that the analysis type is provided
        if not argument_dict['analysis']:
            redmine_instance.issue\
                .update(resource_id=issue.id,
                        notes='WARNING: No analysis type provided. Please ensure that issue contains '
                              '"analysistype=requested_analysis_type", where requested_analysis_type is one of the '
                              'following keywords: {ats}. Please see the the usage guide: '
                              'https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(ats=','.join(analyses)))
            return
        # Ensure that the supplied analysis type is valid
        if argument_dict['analysis'] not in analyses:
            redmine_instance.issue\
                .update(resource_id=issue.id,
                        notes='WARNING: Requested analysis type: {at} not in list of supported analyses: {ats}. Please '
                              'see https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(at=argument_dict['analysis'],
                                                              ats=','.join(analyses)))
            return
        # Make sure that the supplied file format is valid
        if argument_dict['format'] not in formats:
            redmine_instance.issue\
                .update(resource_id=issue.id,
                        notes='WARNING: Requested file format {ft} not in list of supported formats: {fts}. Please '
                              'see https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(ft=argument_dict['format'],
                                                              fts=','.join(formats)))
            return
        # Ensure that the number of mismatches is acceptable
        if argument_dict['mismatches'] not in mismatches:
            redmine_instance.issue\
                .update(resource_id=issue.id,
                        notes='WARNING: Requested number of mismatches, {nm}, is not in the acceptable range of allowed'
                              ' mismatches: {nms}. Please '
                              'see https://olc-bioinformatics.github.io/redmine-docs/analysis/primerfinder/ '
                              'for additional details'.format(nm=argument_dict['mismatches'],
                                                              nms=','.join([str(num) for num in mismatches])))
            return
        # Ensure that SEQIDs were included
        # NOTE(review): unlike the checks above, this only warns and the run continues - confirm this is intended
        if not seqids:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: No SEQIDs provided!')
        # Only the supremacy program honours the requested format; legacy always works on FASTA files. The same
        # file type must be used to retrieve AND to verify the files, otherwise a legacy run with format=fastq
        # would report every SEQID as missing.
        sequence_file_type = argument_dict['format'] if argument_dict['program'] == 'supremacy' else 'fasta'
        # Run file linker and then make sure that all files requested are present. Warn user if they
        # requested things that we don't have.
        retrieve_nas_files(
            seqids=seqids,
            outdir=work_dir,
            filetype=sequence_file_type,
            copyflag=False)
        missing_files = verify_sequence_files_present(
            seqid_list=seqids,
            seq_dir=work_dir,
            file_type=sequence_file_type)
        # Update the Redmine issue if one or more of the requested SEQIDs could not be located
        if missing_files:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes='WARNING: Could not find the following requested SEQIDs on'
                ' the OLC NAS: {mf}'.format(mf=missing_files))
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/in_silico_pcr'
        # Call the appropriate script depending on the requested program
        if argument_dict['program'] == 'legacy':
            # Run legacy primer locator with the necessary arguments
            primer_cmd = 'python -m spadespipeline.legacy_vtyper -s {seqpath} -m {mismatches}'\
                .format(seqpath=work_dir,
                        mismatches=argument_dict['mismatches'])
            # Add additional flags as required
            if argument_dict['analysis'] == 'custom':
                primer_cmd += ' -a {at}'.format(at=argument_dict['analysis'])
                primer_cmd += ' -pf {primer_file}'.format(
                    primer_file=target_file)
            primer_cmd += ' -e' if argument_dict['exportamplicons'] else ''
        else:
            # Run the bbduk-based primer finder (supremacy) with the necessary arguments
            primer_cmd = 'python -m spadespipeline.primer_finder_bbduk -p {seqpath} -s {seqpath} -m {mismatches} ' \
                         '-k {kmerlength} -pf {primer_file}' \
                .format(seqpath=work_dir,
                        mismatches=argument_dict['mismatches'],
                        kmerlength=argument_dict['kmersize'],
                        primer_file=target_file
                        )
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes='Primer finder command:\n {cmd}'.format(cmd=primer_cmd))
        # Create another shell script to execute within the conda environment
        template = "#!/bin/bash\n{} && {}".format(activate, primer_cmd)
        primer_finder_script = os.path.join(work_dir, 'run_primer_finder.sh')
        with open(primer_finder_script, 'w') as script_file:
            script_file.write(template)
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(primer_finder_script)
        # Run shell script
        os.system(primer_finder_script)

        # Zip output; the '.zip' extension is appended to the path returned by zip_folder
        output_filename = 'primer_finder_output'
        zip_filepath = zip_folder(output_dir=work_dir,
                                  output_filename=output_filename,
                                  program=argument_dict['program'])
        zip_filepath += '.zip'
        # Prepare upload
        output_list = [{
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }]
        # Create a list of all the folders - will be used to clean up the working directory
        folders = glob.glob(os.path.join(work_dir, '*/'))
        # Remove all the folders
        for folder in folders:
            if os.path.isdir(folder):
                shutil.rmtree(folder)
        # Wrap up issue
        redmine_instance.issue.update(
            resource_id=issue.id,
            uploads=output_list,
            status_id=4,
            notes='{at} analysis with primer finder complete!'.format(
                at=argument_dict['analysis'].lower()))
    except Exception as e:
        # Top-level boundary: record the failure in Sentry and tell the user it has been logged
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! We log this automatically and will look into the '
            'problem and get back to you with a fix soon.')
コード例 #26
0
def merge_redmine(redmine_instance, issue, work_dir, description):
    """
    Merge FASTQ files as described by the Excel file attached to the issue, assemble the merged files with the
    spadespipeline docker image, and attach the zipped assembly reports to the issue.

    :param redmine_instance: Path to the pickled Redmine object used to read and update issues
    :param issue: Path to the pickled Redmine issue object
    :param work_dir: Working directory that receives the Excel files, linked FASTQ files, and merged output
    :param description: Path to the pickled issue description; it is unpickled but not otherwise used here
    """
    # Unpickle Redmine objects, closing each file handle as soon as it has been read
    # NOTE(review): pickle.load can execute arbitrary code; presumably these files are written by the
    # automator itself - confirm they are never user-supplied
    with open(redmine_instance, 'rb') as pickled_file:
        redmine_instance = pickle.load(pickled_file)
    with open(issue, 'rb') as pickled_file:
        issue = pickle.load(pickled_file)
    with open(description, 'rb') as pickled_file:
        description = pickle.load(pickled_file)

    try:
        # Names and locations derived from the issue number, computed once and reused below
        merged_name = 'merged_' + str(issue.id)
        merged_folder = os.path.join(work_dir, merged_name)
        hdfs_folder = os.path.join('/hdfs', merged_name)
        assembled_folder = os.path.join('/mnt/nas/merge_WGSspades', merged_name + '_Assembled')
        converted_excel = os.path.join(work_dir, 'Merge.xlsx')

        # Download the attached excel file.
        # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
        # out a better way to do it.
        attachment = redmine_instance.issue.get(issue.id,
                                                include='attachments')
        attachment_id = 0
        # When several files are attached, the last one listed is used
        for item in attachment.attachments:
            attachment_id = item.id

        # An attachment id of 0 indicates that we didn't find anything attached to the issue.
        if attachment_id == 0:
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'ERROR: Did not find any attached files. Please create a new issue with '
                'the merge excel file attached and try again.',
                status_id=4)
            return
        attachment = redmine_instance.attachment.get(attachment_id)
        attachment.download(savepath=work_dir, filename='merge.xlsx')

        # Now use convert_excel_file to make compatible with merger.py
        convert_excel_file(os.path.join(work_dir, 'merge.xlsx'), converted_excel)

        # Make a SEQID list of files we'll need to extract.
        seqid_list = generate_seqid_list(converted_excel)

        # Create links of all seqids needed in our working dir
        retrieve_nas_files(seqids=seqid_list,
                           outdir=work_dir,
                           filetype='fastq',
                           copyflag=False)

        # Run the merger script.
        cmd = 'python /mnt/nas/Redmine/OLCRedmineAutomator/automators/merger.py -f {} -d ";" {}'.format(
            converted_excel, work_dir)
        os.system(cmd)

        # Make a folder to put all the merged FASTQs in biorequest folder. and put the merged FASTQs there.
        os.makedirs(merged_folder)
        # The wildcard is expanded by the shell, so this has to be run as a shell command
        cmd = 'mv {merged_files} {merged_folder}'.format(
            merged_files=os.path.join(work_dir, '*MER*/*.fastq.gz'),
            merged_folder=merged_folder)
        os.system(cmd)

        if not glob.glob(os.path.join(merged_folder, '*fastq.gz')):
            redmine_instance.issue.update(
                resource_id=issue.id,
                notes=
                'ERROR: Something went wrong, no merged FASTQ files were created.',
                status_id=4)
            return
        # Now copy those merged FASTQS to merge backup and the hdfs folder so they can be assembled.
        cmd = 'cp {merged_files} /mnt/nas/merge_Backup'.format(
            merged_files=os.path.join(merged_folder, '*.fastq.gz'))
        os.system(cmd)

        cmd = 'cp -r {merged_folder} /hdfs'.format(merged_folder=merged_folder)
        os.system(cmd)

        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Merged FASTQ files created, beginning assembly of merged files.')
        # With files copied over to the HDFS, start the assembly process.
        # Remove any container left over from a previous run, since the container name is fixed
        os.system('docker rm -f spadespipeline')
        # Run docker image.
        cmd = 'docker run -i -u $(id -u) -v /mnt/nas/Adam/spadespipeline/OLCspades/:/spadesfiles ' \
              '-v /mnt/nas/Adam/assemblypipeline/:/pipelinefiles -v  {}:/sequences ' \
              '--name spadespipeline pipeline:0.1.5 OLCspades.py ' \
              '/sequences -r /pipelinefiles'.format(hdfs_folder)
        os.system(cmd)
        # Remove the container.
        os.system('docker rm -f spadespipeline')

        # Move results to merge_WGSspades, and upload the results folder to redmine.
        cmd = 'mv {hdfs_folder} {merge_WGSspades}'.format(
            hdfs_folder=hdfs_folder,
            merge_WGSspades=assembled_folder)
        os.system(cmd)
        # Creates <work_dir>/reports.zip from the 'reports' folder of the assembly
        shutil.make_archive(
            os.path.join(work_dir, 'reports'), 'zip',
            os.path.join(assembled_folder, 'reports'))
        output_list = [{
            'path': os.path.join(work_dir, 'reports.zip'),
            'filename': merged_name + '_reports.zip'
        }]
        redmine_instance.issue.update(
            resource_id=issue.id,
            uploads=output_list,
            status_id=4,
            notes='Merge Process Complete! Reports attached.')
    except Exception as e:
        # Top-level boundary: report any failure on the issue instead of crashing the automator
        redmine_instance.issue.update(
            resource_id=issue.id,
            notes=
            'Something went wrong! Send this error traceback to your friendly '
            'neighborhood bioinformatician: {}'.format(e))
コード例 #27
0
def plasmid_borne_identity(redmine_instance, issue, work_dir, description):
    """
    Run GeneSeekr against user-attached targets, then MOB Recon, and upload the reports to Redmine.

    The issue description is parsed for optional ``CUTOFF=``, ``EVALUE=`` and ``BLAST=`` settings; every
    other entry is treated as a SEQID. FASTA files for the SEQIDs are retrieved into ``work_dir``, the FASTA
    file attached to the issue is downloaded as the custom target database, and both analyses are run
    through generated shell scripts. On success the reports are attached to the issue and the working
    directory is cleaned up. Any unexpected exception is sent to Sentry and a generic note is posted.

    :param redmine_instance: Path to a pickled Redmine instance
    :param issue: Path to a pickled Redmine issue object
    :param work_dir: Directory in which sequence files, targets, scripts, and reports are written
    :param description: Path to a pickled iterable of strings (the entries of the issue description)
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects. Context managers ensure the file handles are closed promptly.
    # NOTE(security): pickle.load executes arbitrary code from the file - these pickles are presumably
    # written by the automator itself; confirm they can never come from an untrusted source.
    with open(redmine_instance, 'rb') as pickle_handle:
        redmine_instance = pickle.load(pickle_handle)
    with open(issue, 'rb') as pickle_handle:
        issue = pickle.load(pickle_handle)
    with open(description, 'rb') as pickle_handle:
        description = pickle.load(pickle_handle)
    # Variable to hold supplied arguments
    argument_dict = {
        'analysis': 'custom',
        'blast': 'blastn',
        'cutoff': 70,
        'evalue': '1E-5',
    }
    # Set and the directory to store the custom targets; it doubles as the database path for the analysis
    target_dir = os.path.join(work_dir, 'targets')
    database_path = {
        'custom': target_dir
    }
    # Current BLAST analyses supported
    blasts = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    try:
        # Parse description to figure out what SEQIDs we need to run on.
        seqids = list()
        for item in description:
            item = item.upper().rstrip()
            if 'CUTOFF' in item:
                argument_dict['cutoff'] = int(item.split('=')[1])
                continue
            if 'EVALUE' in item:
                argument_dict['evalue'] = item.split('=')[1].strip().lower()
                continue
            if 'BLAST' in item:
                argument_dict['blast'] = item.split('=')[1].strip().lower()
                continue
            # Otherwise the item should be a SEQID
            seqids.append(item)

        retrieve_nas_files(seqids=seqids,
                           outdir=work_dir,
                           filetype='fasta',
                           copyflag=False)

        missing_fastas = verify_fasta_files_present(seqids, work_dir)
        if missing_fastas:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: Could not find the following requested SEQIDs on'
                                                ' the OLC NAS: {}'.format(missing_fastas))

        # Create the directory to store the custom targets
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            pass
        # Download the attached FASTA file.
        # First, get the attachment id - this seems like a kind of hacky way to do this, but I have yet to figure
        # out a better way to do it. If several files are attached, the last one listed is used.
        issue_with_attachments = redmine_instance.issue.get(issue.id, include='attachments')
        attachment_id = 0
        for item in issue_with_attachments.attachments:
            attachment_id = item.id
        # An attachment id of 0 indicates that we didn't find anything attached to the issue.
        if attachment_id == 0:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='ERROR: Analysis type custom requires an attached FASTA file of '
                                                'targets. The automator could not find any attached files. '
                                                'Please create a new issue with the FASTA file attached and try '
                                                'again.',
                                          status_id=4)
            return
        attachment = redmine_instance.attachment.get(attachment_id)
        attachment.download(savepath=target_dir, filename='targets.tfa')
        # Ensure that the requested BLAST analysis is valid
        if argument_dict['blast'] not in blasts:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: requested BLAST analysis, {bt}, is not one of the currently '
                                                'supported analyses: {blasts}'.format(bt=argument_dict['blast'],
                                                                                      blasts=', '.join(blasts)),
                                          status_id=4)
            return
        # Ensure that the requested e-value is numeric. The value comes straight from the issue description
        # and is written into a shell script below, so anything that is not a number must be rejected.
        try:
            float(argument_dict['evalue'])
        except ValueError:
            redmine_instance.issue.update(resource_id=issue.id,
                                          notes='WARNING: requested e-value, {ev}, is not a valid number. Please '
                                                'create a new issue with a valid EVALUE (e.g. 1E-5) and try again.'
                                          .format(ev=argument_dict['evalue']),
                                          status_id=4)
            return
        # These unfortunate hard coded paths appear to be necessary
        activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/cowbat'
        seekr_py = '/mnt/nas2/virtual_environments/geneseekr/bin/GeneSeekr'
        # Run GeneSeekr with the necessary arguments
        seekr_cmd = 'python {seekr_py} {blast} -s {seqpath} -r {outpath} -t {dbpath} -c {cutoff} -e {evalue}' \
            .format(seekr_py=seekr_py,
                    blast=argument_dict['blast'],
                    seqpath=work_dir,
                    outpath=os.path.join(work_dir, 'reports'),
                    dbpath=database_path[argument_dict['analysis']],
                    cutoff=argument_dict['cutoff'],
                    evalue=argument_dict['evalue'])
        # Update the issue with the GeneSeekr command
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='GeneSeekr command:\n {cmd}'.format(cmd=seekr_cmd))
        # Create a shell script that activates the conda environment and then runs GeneSeekr
        geneseekr_script = os.path.join(work_dir, 'run_geneseekr.sh')
        with open(geneseekr_script, 'w') as script_handle:
            script_handle.write("#!/bin/bash\n{} && {}".format(activate, seekr_cmd))
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(geneseekr_script)
        # Run shell script
        os.system(geneseekr_script)
        # Run MOB Recon with the necessary arguments
        # Use the COWBAT_DATABASES variable as the database path
        mob_cmd = 'python -m spadespipeline.mobrecon -s {seqfolder} -r {targetfolder} -a geneseekr -b {blast}' \
            .format(seqfolder=work_dir,
                    targetfolder=os.path.join(COWBAT_DATABASES, 'mobrecon'),
                    blast=argument_dict['blast'])
        # Update the issue with the MOB Recon command
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='MOB Recon command:\n {cmd}'.format(cmd=mob_cmd))
        # Create another shell script that activates the same conda environment and then runs MOB Recon
        mob_script = os.path.join(work_dir, 'run_mob_recon.sh')
        with open(mob_script, 'w') as script_handle:
            script_handle.write("#!/bin/bash\n{} && {}".format(activate, mob_cmd))
        # Modify the permissions of the script to allow it to be run on the node
        make_executable(mob_script)
        # Run shell script
        os.system(mob_script)
        # Get the output files uploaded. Each report is added separately, in this order:
        # GeneSeekr Excel-formatted report, detailed GeneSeekr report, simple GeneSeekr report,
        # MOB Recon summary report, plasmid-borne summary report
        report_names = [
            'geneseekr_{blast}.xlsx'.format(blast=argument_dict['blast']),
            'geneseekr_{blast}_detailed.csv'.format(blast=argument_dict['blast']),
            'geneseekr_{blast}.csv'.format(blast=argument_dict['blast']),
            'mob_recon_summary.csv',
            'plasmid_borne_summary.csv',
        ]
        output_list = [
            {'path': os.path.join(work_dir, 'reports', report_name), 'filename': report_name}
            for report_name in report_names
        ]
        redmine_instance.issue.update(resource_id=issue.id, uploads=output_list, status_id=4,
                                      notes='PlasmidBorne identity process complete!')

        # Clean up all FASTA/FASTQ files so we don't take up too much space on the NAS
        os.system('rm {workdir}/*fasta'.format(workdir=work_dir))
        # Remove all other folders. Only the immediate sub-directories need to be listed, since rmtree
        # removes everything beneath them. Cleanup is best-effort: a folder that cannot be removed must
        # not turn an otherwise successful run into a reported failure, nor stop the remaining removals.
        try:
            leftovers = [os.path.join(work_dir, entry) for entry in os.listdir(work_dir)]
        except OSError:
            leftovers = []
        for leftover in leftovers:
            if os.path.isdir(leftover):
                shutil.rmtree(leftover, ignore_errors=True)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        redmine_instance.issue.update(resource_id=issue.id,
                                      notes='Something went wrong! We log this automatically and will look into the '
                                            'problem and get back to you with a fix soon.')