Esempio n. 1
0
    def _add_options(self):
        """Register the 'build' tool parser and its options on the subparsers object.

        Notes
        -----
            The tool's parser is appended to the subparser object stored on the instance (``self.subparsers``).
            Nothing is returned.

        """
        # Text used both as the short help of the tool and as the description of its own parser.
        description = (
            "Create a VDJ or VJ model by executing IGoR's commandline tool via a python subprocess using default "
            "model parameters."
        )

        # Help texts are assembled first so the option table below stays compact.
        seqs_help = "An input FASTA or separated data file with sequences for training the model."
        ref_help = (
            "A gene (V, D or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
            "needs to conform to IGMT annotation (separated by '|' character)."
        )
        type_help = 'The type of model to create. (select one: %(choices)s).'
        n_iter_help = 'The number of inference iterations to perform when creating the model (default: {}).'.format(
            get_config_data('BUILD', 'NUM_ITERATIONS', 'int'))

        # Option table (flag -> keyword settings) that is handed to dynamic_cli_options.
        parser_options = {
            '-seqs': {
                'metavar': '<fasta/separated>',
                'required': 'True',
                'type': 'str',
                'help': seqs_help,
            },
            '-ref': {
                'metavar': ('<gene>', '<fasta>'),
                'type': 'str',
                'action': 'append',
                'nargs': 2,
                'required': 'True',
                'help': ref_help,
            },
            '-type': {
                'type': 'str.lower',
                'choices': ['alpha', 'beta', 'light', 'heavy'],
                'required': 'True',
                'help': type_help,
            },
            '-n-iter': {
                'type': 'int',
                'nargs': '?',
                'help': n_iter_help,
            },
        }

        # Create the tool's parser on the subparsers object and attach the options to it.
        build_parser = self.subparsers.add_parser('build', help=description, description=description)
        dynamic_cli_options(parser=build_parser, options=parser_options)
Esempio n. 2
0
    def _add_options(self):
        """Register the 'locate' tool parser and its options on the subparsers object.

        Notes
        -----
            The tool's parser is appended to the subparser object stored on the instance (``self.subparsers``).
            Nothing is returned.

        """
        # Text used both as the short help of the tool and as the description of its own parser.
        description = (
            "Create an alignment for the given reference genome FASTA files and seach the given alignment for "
            "conserved motif regions. The located CDR3 anchors can be used for the other tools."
        )

        # Help texts are assembled first; the motif help shows the configured default V and J motif lists.
        ref_help = (
            "A gene (V or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
            "needs to conform to IGMT annotation (separated by '|' character)."
        )
        default_v_motifs = get_config_data('LOCATE', 'V_MOTIFS').split(',')
        default_j_motifs = get_config_data('LOCATE', 'J_MOTIFS').split(',')
        motif_help = "The motifs to look for (default: 'V' {} and 'J' {} respectivly).".format(
            default_v_motifs, default_j_motifs)

        # Option table (flag -> keyword settings) that is handed to dynamic_cli_options.
        parser_options = {
            '-ref': {
                'metavar': ('<gene>', '<fasta>'),
                'type': 'str',
                'action': 'append',
                'nargs': 2,
                'required': 'True',
                'help': ref_help,
            },
            '-motif': {
                'type': 'str.upper',
                'action': 'append',
                'help': motif_help,
            },
        }

        # Create the tool's parser on the subparsers object and attach the options to it.
        locate_parser = self.subparsers.add_parser('locate', help=description, description=description)
        dynamic_cli_options(parser=locate_parser, options=parser_options)
Esempio n. 3
0
def main():
    """Create the ArgumentParser containing the sub-options and run the selected tool.

    The function sets up logging, builds the general parser plus one sub-parser per tool, applies the general
    commandline values to the configuration, prepares a temporary working directory and finally executes the
    tool that matches the given sub-command. Problems during setup are logged and end the function early.

    The temporary directory is removed (when 'REMOVE_TEMP_DIR' is enabled in the 'EXPERT' configuration section)
    even if the selected tool raises an exception; the exception itself is still propagated to the caller.

    """
    # Setting up the logger.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=os.environ.get("LOGLEVEL", "INFO")
    )
    logger = logging.getLogger(__name__)

    # Create the parser with general commands and set the subparser.
    description = 'Create IGoR models and calculate the generation probability of V(D)J and CDR3 sequences.'
    parser_general_options = {
        '-separator': {
            'type': 'str.lower',
            'choices': ['tab', 'semi-colon', 'comma'],
            'help': 'The separator character used for input files and for writing new files (select one: %(choices)s) '
                    '(default: {}).'.format(
                        {'\t': 'tab', ';': 'semi-colon', ',': 'comma'}[get_config_data('COMMON', 'SEPARATOR')])
        },
        '-threads': {
            'type': 'int',
            'nargs': '?',
            'help': 'The number of threads the program is allowed to use (default: {}).'
                    .format(get_config_data('COMMON', 'NUM_THREADS', 'int'))
        },
        '-set-wd': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional location for writing files (default: {}).'.format(get_config_data('COMMON', 'WORKING_DIR'))
        },
        '-out-name': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional output file name. If multiple files are created, the value is used as a prefix for the file '
                    '(default: {}).'.format(get_config_data('COMMON', 'OUT_NAME'))
        },
        '-config-file': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional configuration file path for ImmunoProbs. This file is always combined with the default '
                    'configuration to make up missing values.'
        },
    }
    parser = argparse.ArgumentParser(prog='immuno-probs', description=description)
    parser = dynamic_cli_options(parser=parser, options=parser_general_options)
    subparsers = parser.add_subparsers(
        help='Supported immuno-probs options, command plus help displays more information for the option.',
        dest='subparser_name'
    )

    # Add main- and suboptions to the subparser. The tools are constructed in a fixed order and stored under
    # the sub-command name that is compared against 'subparser_name' further below.
    logger.info('Setting up ImmunoProbs commandline tools')
    try:
        convert_tool = ConvertAdaptiveSequences(subparsers=subparsers)
        locate_tool = LocateCdr3Anchors(subparsers=subparsers)
        build_tool = BuildIgorModel(subparsers=subparsers)
        generate_tool = GenerateSequences(subparsers=subparsers)
        evaluate_tool = EvaluateSequences(subparsers=subparsers)
    except (TypeError) as err:
        logger.error(str(err))
        return
    tools = {
        'convert': convert_tool,
        'locate': locate_tool,
        'build': build_tool,
        'generate': generate_tool,
        'evaluate': evaluate_tool,
    }

    # Parse the commandline arguments and set variables. The configuration file is applied first, the
    # remaining general options are applied afterwards.
    logger.info('Parsing/formatting commandline arguments')
    try:
        parsed_arguments = parser.parse_args()
        if parsed_arguments.config_file is not None:
            set_config_data(parsed_arguments.config_file)
        if parsed_arguments.separator is not None:
            set_separator(parsed_arguments.separator)
        if parsed_arguments.threads is not None:
            set_num_threads(parsed_arguments.threads)
        if parsed_arguments.set_wd is not None:
            set_working_dir(parsed_arguments.set_wd)
        if parsed_arguments.out_name is not None:
            set_out_name(parsed_arguments.out_name)
    except (TypeError, ValueError, IOError) as err:
        logger.error(str(err))
        return

    # Create the directory paths for temporary files.
    logger.info('Setting up temporary system directory')
    try:
        # The output directory is read before the working directory is redirected to the temporary directory.
        output_dir = get_config_data('COMMON', 'WORKING_DIR')
        if get_config_data('EXPERT', 'USE_SYSTEM_TEMP', 'bool'):
            temp_root = tempfile.gettempdir()
        else:
            temp_root = output_dir
        temp_dir = create_directory_path(os.path.join(temp_root, get_config_data('EXPERT', 'TEMP_DIR')))
        set_working_dir(temp_dir)
    except (IOError, AttributeError) as err:
        logger.error(str(err))
        return

    try:
        # Execute the correct tool based on given subparser name.
        logger.info('Executing selected ImmunoProbs tool (%s)', parsed_arguments.subparser_name)
        selected_tool = tools.get(parsed_arguments.subparser_name)
        if selected_tool is None:
            logger.error('No tool selected, run help command to show all supported tools')
        else:
            selected_tool.run(args=parsed_arguments, output_dir=output_dir)
    finally:
        # Finally, delete the temporary directory if specified. Doing this in a 'finally' clause makes sure
        # the directory does not stay behind when the tool fails with an unexpected exception.
        if get_config_data('EXPERT', 'REMOVE_TEMP_DIR', 'bool'):
            logger.info('Cleaning up working directory')
            rmtree(temp_dir, ignore_errors=True)
Esempio n. 4
0
    def _add_options(self):
        """Register the 'evaluate' tool parser and its options on the subparsers object.

        Notes
        -----
            The tool's parser is appended to the subparser object stored on the instance (``self.subparsers``).
            Whether some options are required depends on the flags that are present in ``sys.argv``.

        """
        # Text used both as the short help of the tool and as the description of its own parser.
        description = (
            "Evaluate VDJ or VJ sequences given a custom IGoR model (or build-in) through IGoR's commandline "
            "tool via python subprocess. Or evaluate CDR3 sequences with the model by using OLGA."
        )

        # Flags on the raw commandline that decide which of the other options become required.
        custom_model_given = '-custom-model' in sys.argv
        cdr3_given = '-cdr3' in sys.argv

        # Values that need a helper call, collected in the same order as the options appear below.
        model_choices = get_default_model_file_paths()
        cdr3_help = (
            'If specified (True), CDR3 sequences should be evaluated, otherwise V(D)J sequences (default: {}).'
        ).format(get_config_data('EVALUATE', 'EVAL_CDR3', 'bool'))
        use_allele_help = (
            "If specified (True), in combination with the '-cdr3' flag, the allele information from the gene "
            "choice fields is used to calculate the generation probability (default: {})."
        ).format(get_config_data('EVALUATE', 'USE_ALLELE', 'bool'))

        # Option table (flag -> keyword settings) that is handed to dynamic_cli_options.
        parser_options = {
            '-seqs': {
                'metavar': '<fasta/separated>',
                'required': 'True',
                'type': 'str',
                'help': "An input FASTA or separated data file with sequences to evaluate.",
            },
            '-model': {
                'type': 'str.lower',
                'choices': model_choices,
                'required': not custom_model_given,
                'help': (
                    "Specify a pre-installed model for evaluation. (required if -custom-model NOT specified) "
                    "(select one: %(choices)s)."
                ),
            },
            '-ref': {
                'metavar': ('<gene>', '<fasta>'),
                'type': 'str',
                'action': 'append',
                'nargs': 2,
                'required': (not cdr3_given) and custom_model_given,
                'help': (
                    "A gene (V, D or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                    "needs to conform to IGMT annotation (separated by '|' character). (required for -custom-model "
                    "without -cdr3)"
                ),
            },
            '-type': {
                'type': 'str.lower',
                'choices': ['alpha', 'beta', 'light', 'heavy'],
                'required': custom_model_given,
                'help': 'The type of the custom model to use. (select one: %(choices)s) (required for -custom-model).',
            },
            '-custom-model': {
                'metavar': ('<parameters>', '<marginals>'),
                'type': 'str',
                'nargs': 2,
                'help': 'A IGoR parameters file followed by an IGoR marginals file.',
            },
            '-anchor': {
                'metavar': ('<gene>', '<separated>'),
                'type': 'str',
                'action': 'append',
                'nargs': 2,
                'required': cdr3_given and custom_model_given,
                'help': (
                    'A gene (V or J) followed by a CDR3 anchor separated data file. Note: need to contain gene in the '
                    'first column, anchor index in the second and gene function in the third (required for -cdr3 and '
                    '-custom-model).'
                ),
            },
            '-cdr3': {
                'action': 'store_true',
                'help': cdr3_help,
            },
            '-use-allele': {
                'action': 'store_true',
                'help': use_allele_help,
            },
        }

        # Create the tool's parser on the subparsers object and attach the options to it.
        evaluate_parser = self.subparsers.add_parser('evaluate', help=description, description=description)
        dynamic_cli_options(parser=evaluate_parser, options=parser_options)
Esempio n. 5
0
    def run(self, args, output_dir):
        """Function to execute the commandline tool.

        Depending on the '-cdr3' flag (or the 'EVAL_CDR3' value of the 'EVALUATE' configuration section when the
        flag is not given) either full V(D)J sequences are evaluated through IGoR or CDR3 sequences through
        OLGA. Expected problems are logged through ``self.logger`` and end the evaluation early.

        Parameters
        ----------
        args : Namespace
            Object containing our parsed commandline arguments.
        output_dir : str
            A directory path for writing output files to.

        """
        eval_cdr3 = get_config_data('EVALUATE', 'EVAL_CDR3', 'bool')
        if args.cdr3:
            eval_cdr3 = args.cdr3

        if eval_cdr3:
            # If the given type of sequences evaluation is CDR3, use OLGA.
            self._evaluate_cdr3(args, output_dir)
        else:
            # If the given type of sequences evaluation is VDJ, use IGoR.
            self._evaluate_vdj(args, output_dir)

    def _evaluate_vdj(self, args, output_dir):
        """Evaluate V(D)J sequences by executing IGoR and merging its 'Pgen' output with the input sequences."""
        # Add general IGoR commands.
        self.logger.info('Setting up initial IGoR command (1/4)')
        working_dir = get_config_data('COMMON', 'WORKING_DIR')
        command_list = [
            ['set_wd', working_dir],
            ['threads', str(get_config_data('COMMON', 'NUM_THREADS', 'int'))],
        ]

        # Add the model (build-in or custom) command depending on given.
        self.logger.info('Processing genomic reference templates (2/4)')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                command_list.append([
                    'set_custom_model',
                    files['parameters'],
                    files['marginals']
                ])
                ref_list = ['set_genomic']
                for gene, filename in files['reference'].items():
                    ref_list.append([gene, filename])
                command_list.append(ref_list)
            elif args.custom_model:
                model_type = args.type
                command_list.append([
                    'set_custom_model',
                    copy_to_dir(working_dir, str(args.custom_model[0]), 'txt'),
                    copy_to_dir(working_dir, str(args.custom_model[1]), 'txt'),
                ])
                ref_list = ['set_genomic']
                for i in args.ref:
                    filename = preprocess_reference_file(
                        os.path.join(working_dir, 'genomic_templates'),
                        copy_to_dir(working_dir, i[1], 'fasta'),
                        1
                    )
                    ref_list.append([i[0], filename])
                command_list.append(ref_list)
            else:
                # Without this guard 'model_type' would stay unbound and the function would fail later on.
                self.logger.error(
                    'No model given, specify either a pre-installed model (-model) or a custom model (-custom-model)')
                return
        except IOError as err:
            self.logger.error(str(err))
            return

        # Add the sequence command after pre-processing of the input file.
        self.logger.info('Pre-processing input sequence file (3/4)')
        try:
            # The detected file type is remembered, it is needed again when reading the sequences back in.
            seqs_are_fasta = is_fasta(args.seqs)
            if seqs_are_fasta:
                self.logger.info('FASTA input file extension detected')
                command_list.append([
                    'read_seqs',
                    copy_to_dir(working_dir, str(args.seqs), 'fasta')
                ])
            elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
                self.logger.info('Separated input file type detected')
                input_seqs = preprocess_separated_file(
                    os.path.join(working_dir, 'input'),
                    copy_to_dir(working_dir, str(args.seqs), 'csv'),
                    get_config_data('COMMON', 'SEPARATOR'),
                    ';',
                    get_config_data('COMMON', 'I_COL'),
                    [get_config_data('COMMON', 'NT_COL')]
                )
                command_list.append(['read_seqs', input_seqs])
            else:
                self.logger.error(
                    'Given input sequence file could not be detected as '
                    'FASTA file or separated data type')
                return
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Add alignment and evaluation commands.
        self.logger.info('Adding additional variables to IGoR command (4/4)')
        command_list.append(['align', ['all']])
        command_list.append(['evaluate'])
        command_list.append(['output', ['Pgen']])

        # Execute IGoR through command line and catch error code.
        self.logger.info('Executing IGoR (this might take a while)')
        try:
            igor_cline = IgorInterface(command=command_list)
            exit_code, _, stderr, _ = igor_cline.call()
            if exit_code != 0:
                self.logger.error(
                    "An error occurred during execution of IGoR command "
                    "(exit code %s):\n%s", exit_code, stderr)
                return
        except OSError as err:
            self.logger.error(str(err))
            return

        # Read in all data frame files, based on input file type.
        self.logger.info('Processing generation probabilities')
        try:
            if seqs_are_fasta:
                seqs_df = read_fasta_as_dataframe(
                    file=args.seqs,
                    col=get_config_data('COMMON', 'NT_COL'))
            else:
                seqs_df = read_separated_to_dataframe(
                    file=args.seqs,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_col=get_config_data('COMMON', 'I_COL'))
            # The IGoR output is read from 'output/Pgen_counts.csv' inside the working directory.
            full_pgen_df = read_separated_to_dataframe(
                file=os.path.join(working_dir, 'output', 'Pgen_counts.csv'),
                separator=';',
                index_col='seq_index',
                cols=['Pgen_estimate'])
            full_pgen_df.index.names = [get_config_data('COMMON', 'I_COL')]
            full_pgen_df.rename(
                columns={'Pgen_estimate': get_config_data('COMMON', 'NT_P_COL')},
                inplace=True)
            # The amino acid probability column is added but left empty (NaN) for this evaluation type.
            full_pgen_df.loc[:, get_config_data('COMMON', 'AA_P_COL')] = numpy.nan
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Insert amino acid sequence column if not existent.
        self.logger.info('Formatting output dataframe')
        nt_col = get_config_data('COMMON', 'NT_COL')
        aa_col = get_config_data('COMMON', 'AA_COL')
        if nt_col in seqs_df.columns and aa_col not in seqs_df.columns:
            # Place the new column directly behind the nucleotide column before filling it.
            seqs_df.insert(seqs_df.columns.get_loc(nt_col) + 1, aa_col, numpy.nan)
            seqs_df[aa_col] = seqs_df[nt_col].apply(nucleotides_to_aminoacids)

        # Merge IGoR generated sequence output dataframes.
        full_pgen_df = seqs_df.merge(full_pgen_df, left_index=True, right_index=True)

        # Write the pandas dataframe to a separated file.
        self._write_pgen_dataframe(full_pgen_df, 'pgen_estimate_{}'.format(model_type), output_dir)

    def _evaluate_cdr3(self, args, output_dir):
        """Evaluate CDR3 sequences with OLGA using a pre-installed or custom IGoR model."""
        # Create the directory for the output files.
        working_dir = os.path.join(get_config_data('COMMON', 'WORKING_DIR'), 'output')
        if not os.path.isdir(working_dir):
            os.makedirs(working_dir)

        # Load the model and create the sequence evaluator.
        self.logger.info('Loading the IGoR model files')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                model = IgorLoader(model_type=model_type,
                                   model_params=files['parameters'],
                                   model_marginals=files['marginals'])
                # NOTE: the anchors of the pre-installed model replace any '-anchor' values on 'args'.
                args.anchor = [['V', files['v_anchors']],
                               ['J', files['j_anchors']]]
                # The anchor files of the pre-installed models are read with a tab separator.
                separator = '\t'
            elif args.custom_model:
                model_type = args.type
                model = IgorLoader(model_type=model_type,
                                   model_params=args.custom_model[0],
                                   model_marginals=args.custom_model[1])
                separator = get_config_data('COMMON', 'SEPARATOR')
            else:
                # Without this guard 'model' and 'separator' would stay unbound and raise an uncaught error.
                self.logger.error(
                    'No model given, specify either a pre-installed model (-model) or a custom model (-custom-model)')
                return
            for gene in args.anchor:
                anchor_file = preprocess_separated_file(
                    os.path.join(working_dir, 'cdr3_anchors'),
                    str(gene[1]),
                    separator,
                    ','
                )
                model.set_anchor(gene=gene[0], file=anchor_file)
            model.initialize_model()
        except (TypeError, OSError, IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Based on input file type, load in input file.
        self.logger.info('Pre-processing input sequence file')
        try:
            if is_fasta(args.seqs):
                self.logger.info('FASTA input file extension detected')
                seqs_df = read_fasta_as_dataframe(
                    file=args.seqs,
                    col=get_config_data('COMMON', 'NT_COL'))
            elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
                self.logger.info('Separated input file type detected')
                seqs_df = read_separated_to_dataframe(
                    file=args.seqs,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_col=get_config_data('COMMON', 'I_COL'))
            else:
                self.logger.error('Given input sequence file could not be detected as FASTA file or separated data type')
                return
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Evaluate the sequences.
        self.logger.info('Evaluating sequences')
        try:
            # The commandline flag can only switch the allele usage on, otherwise the configured value counts.
            use_allele = get_config_data('EVALUATE', 'USE_ALLELE', 'bool')
            if args.use_allele:
                use_allele = args.use_allele
            seq_evaluator = OlgaContainer(
                igor_model=model,
                nt_col=get_config_data('COMMON', 'NT_COL'),
                nt_p_col=get_config_data('COMMON', 'NT_P_COL'),
                aa_col=get_config_data('COMMON', 'AA_COL'),
                aa_p_col=get_config_data('COMMON', 'AA_P_COL'),
                v_gene_choice_col=get_config_data('COMMON', 'V_GENE_CHOICE_COL'),
                j_gene_choice_col=get_config_data('COMMON', 'J_GENE_CHOICE_COL'))
            cdr3_pgen_df = seq_evaluator.evaluate(
                seqs=seqs_df,
                num_threads=get_config_data('COMMON', 'NUM_THREADS', 'int'),
                use_allele=use_allele,
                default_allele=get_config_data('EVALUATE', 'DEFAULT_ALLELE'))

            # Merge the evaluated output with the input sequence dataframe.
            cdr3_pgen_df = seqs_df.merge(cdr3_pgen_df, left_index=True, right_index=True)
        except (TypeError, IOError) as err:
            self.logger.error(str(err))
            return

        # Write the pandas dataframe to a separated file.
        self._write_pgen_dataframe(cdr3_pgen_df, 'pgen_estimate_{}_CDR3'.format(model_type), output_dir)

    def _write_pgen_dataframe(self, dataframe, default_filename, output_dir):
        """Write the evaluated dataframe to a separated file, 'default_filename' is used if no 'OUT_NAME' is set."""
        try:
            self.logger.info('Writing evaluated data to file system')
            output_filename = get_config_data('COMMON', 'OUT_NAME')
            if not output_filename:
                output_filename = default_filename
            _, filename = write_dataframe_to_separated(
                dataframe=dataframe,
                filename=output_filename,
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                index_name=get_config_data('COMMON', 'I_COL'))
            self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return
Esempio n. 6
0
    def run(self, args, output_dir):
        """Function to execute the commandline tool.

        For every '-ref' entry (a gene followed by a reference FASTA file) a MUSCLE alignment is built, the
        motifs are located in it and the resulting CDR3 anchors are written to a separated file in the output
        directory. Expected problems are logged through ``self.logger`` and end the function early.

        Parameters
        ----------
        args : Namespace
            Object containing our parsed commandline arguments.
        output_dir : str
            A directory path for writing output files to.

        """
        def split_header(value):
            """Return the fields on index 1 and 3 of a '|' separated FASTA header."""
            fields = value.split('|')
            return fields[1], fields[3]

        # Get the working directory.
        working_dir = get_config_data('COMMON', 'WORKING_DIR')

        # Create the alignment and locate the motifs.
        for gene in args.ref:
            self.logger.info(
                'Processing genomic reference template for %s and building MUSCLE alignment',
                gene[0])
            try:
                filename = preprocess_reference_file(
                    os.path.join(working_dir, 'genomic_templates'),
                    copy_to_dir(working_dir, gene[1], 'fasta'),
                )
                aligner = MuscleAligner(infile=filename)
                locator = AnchorLocator(
                    alignment=aligner.get_muscle_alignment(), gene=gene[0])
            except (OSError, ValueError, IOError) as err:
                self.logger.error(str(err))
                return

            try:
                self.logger.info('Locating CDR3 anchors for %s', gene[0])
                # Motifs given on the commandline take precedence over the configured defaults.
                if args.motif is not None:
                    motifs = args.motif
                elif gene[0] == 'V':
                    motifs = get_config_data('LOCATE', 'V_MOTIFS').split(',')
                elif gene[0] == 'J':
                    motifs = get_config_data('LOCATE', 'J_MOTIFS').split(',')
                else:
                    # Default motifs only exist for 'V' and 'J'. Without this branch the anchor dataframe would
                    # be unbound (or still be the one of the previous gene) in the steps below.
                    self.logger.error(
                        "No default motifs available for gene '%s' (expected 'V' or 'J'), specify them with -motif",
                        gene[0])
                    return
                anchors_df = locator.get_indices_motifs(
                    get_config_data('COMMON', 'NUM_THREADS', 'int'),
                    *motifs)
            except ValueError as err:
                self.logger.error(str(err))
                return

            # Modify the dataframe to make it OLGA compliant.
            self.logger.info('Formatting CDR3 anchor dataframe')
            try:
                anchors_df.insert(2, 'function', numpy.nan)
                anchors_df.rename(columns={'name': 'gene'}, inplace=True)
                # Replace the full header by the gene name and fill in the gene function from the same header.
                anchors_df['gene'], anchors_df['function'] = zip(
                    *anchors_df['gene'].apply(split_header))
            except (IndexError, ValueError):
                self.logger.error(
                    "FASTA header needs to be separated by '|', needs to have gene name on index position 1 and function "
                    "on index position 3: '%s'", anchors_df['gene'])
                return

            # Write the pandas dataframe to a separated file with prefix.
            try:
                self.logger.info('Writing CDR3 acnhors for %s to system',
                                 gene[0])
                output_prefix = get_config_data('COMMON', 'OUT_NAME')
                if not output_prefix:
                    output_prefix = 'gene_CDR3_anchors'
                _, filename = write_dataframe_to_separated(
                    dataframe=anchors_df,
                    filename='{}_{}'.format(gene[0], output_prefix),
                    directory=output_dir,
                    separator=get_config_data('COMMON', 'SEPARATOR'))
                self.logger.info("Written '%s' for %s gene", filename, gene[0])
            except IOError as err:
                self.logger.error(str(err))
                return
Esempio n. 7
0
    def _add_options(self):
        """Register the 'generate' tool parser and its options on the subparsers object.

        Notes
        -----
            The tool's parser is appended to the subparser object stored on the instance (``self.subparsers``).
            Whether some options are required depends on the flags that are present in ``sys.argv``.

        """
        # Text used both as the short help of the tool and as the description of its own parser.
        description = (
            "Generate VDJ or VJ sequences given a custom IGoR model (or build-in) by executing IGoR's "
            "commandline tool via python subprocess. Or generate CDR3 sequences from the model by using OLGA."
        )

        # Flags on the raw commandline that decide which of the other options become required.
        custom_model_given = '-custom-model' in sys.argv
        cdr3_given = '-cdr3' in sys.argv

        # Values that need a helper call, collected in the same order as the options appear below.
        model_choices = get_default_model_file_paths()
        n_gen_help = 'The number of sequences to generate (default: {}).'.format(
            get_config_data('GENERATE', 'NUM_GENERATE', 'int'))
        cdr3_help = (
            'If specified (True), CDR3 sequences are generated, otherwise V(D)J sequences (default: {}).'
        ).format(get_config_data('GENERATE', 'EVAL_CDR3', 'bool'))

        # Option table (flag -> keyword settings) that is handed to dynamic_cli_options.
        parser_options = {
            '-model': {
                'type': 'str.lower',
                'choices': model_choices,
                'required': not custom_model_given,
                'help': (
                    "Specify a pre-installed model for generation. (required if -custom-model NOT specified) "
                    "(select one: %(choices)s)."
                ),
            },
            '-type': {
                'type': 'str.lower',
                'choices': ['alpha', 'beta', 'light', 'heavy'],
                'required': custom_model_given,
                'help': 'The type of the custom model to use. (select one: %(choices)s) (required for -custom-model).',
            },
            '-anchor': {
                'metavar': ('<gene>', '<separated>'),
                'type': 'str',
                'action': 'append',
                'nargs': 2,
                'required': cdr3_given and custom_model_given,
                'help': (
                    'A gene (V or J) followed by a CDR3 anchor separated data file. Note: need to contain gene in the '
                    'first column, anchor index in the second and gene function in the third (required for -cdr3 and '
                    '-custom-model).'
                ),
            },
            '-custom-model': {
                'metavar': ('<parameters>', '<marginals>'),
                'type': 'str',
                'nargs': 2,
                'help': 'A IGoR parameters file followed by an IGoR marginals file.',
            },
            '-n-gen': {
                'type': 'int',
                'nargs': '?',
                'help': n_gen_help,
            },
            '-cdr3': {
                'action': 'store_true',
                'help': cdr3_help,
            },
        }

        # Create the tool's parser on the subparsers object and attach the options to it.
        generate_parser = self.subparsers.add_parser('generate', help=description, description=description)
        dynamic_cli_options(parser=generate_parser, options=parser_options)
Esempio n. 8
0
    def run(self, args, output_dir):
        """Function to execute the commandline tool.

        Generates either V(D)J sequences by executing IGoR or CDR3 sequences
        through an ``OlgaContainer`` and writes the resulting dataframe to a
        separated file in the output directory. Which of the two is used is
        read from the 'GENERATE'/'EVAL_CDR3' configuration value and switched
        to CDR3 when ``args.cdr3`` is set.

        Every handled failure (non-zero IGoR exit code or one of the caught
        exception types) is logged through ``self.logger`` and ends the method
        early; the method has no return value.

        Parameters
        ----------
        args : Namespace
            Object containing our parsed commandline arguments.
        output_dir : str
            A directory path for writing output files to.

        """
        # The commandline flag can only switch CDR3 generation on; when it is
        # not given the configured value is kept.
        eval_cdr3 = get_config_data('GENERATE', 'EVAL_CDR3', 'bool')
        if args.cdr3:
            eval_cdr3 = args.cdr3

        # If the given type of sequences generation is not CDR3, use IGoR.
        if not eval_cdr3:

            # Add general igor commands.
            self.logger.info('Setting up initial IGoR command (1/3)')
            command_list = []
            working_dir = get_config_data('COMMON', 'WORKING_DIR')
            command_list.append(['set_wd', working_dir])
            command_list.append([
                'threads',
                str(get_config_data('COMMON', 'NUM_THREADS', 'int'))
            ])

            # Add the model (built-in or custom) command. A pre-installed
            # model wins when both options are present; custom model files are
            # first copied into the working directory.
            self.logger.info('Processing IGoR model files (2/3)')
            try:
                if args.model:
                    files = get_default_model_file_paths(name=args.model)
                    command_list.append([
                        'set_custom_model', files['parameters'],
                        files['marginals']
                    ])
                elif args.custom_model:
                    command_list.append([
                        'set_custom_model',
                        copy_to_dir(working_dir, str(args.custom_model[0]),
                                    'txt'),
                        copy_to_dir(working_dir, str(args.custom_model[1]),
                                    'txt')
                    ])
            except IOError as err:
                self.logger.error(str(err))
                return

            # Add generate command; the configured amount is used when no
            # (or a zero) amount is given on the commandline.
            self.logger.info(
                'Adding additional variables to IGoR command (3/3)')
            if args.n_gen:
                command_list.append(['generate', str(args.n_gen), ['noerr']])
            else:
                command_list.append([
                    'generate',
                    str(get_config_data('GENERATE', 'NUM_GENERATE', 'int')),
                    ['noerr']
                ])

            # Execute IGoR through command line and catch error code.
            self.logger.info('Executing IGoR (this might take a while)')
            try:
                igor_cline = IgorInterface(command=command_list)
                exit_code, _, stderr, _ = igor_cline.call()
                if exit_code != 0:
                    self.logger.error(
                        "An error occurred during execution of IGoR command (exit code %s):\n%s",
                        exit_code, stderr)
                    return
            except OSError as err:
                self.logger.error(str(err))
                return

            # Merge the generated output files together (translated). Both
            # files are read from '<working_dir>/generated' with ';' as
            # separator and are joined on their shared sequence index.
            self.logger.info('Processing sequence realizations')
            try:
                seqs_df = read_separated_to_dataframe(file=os.path.join(
                    working_dir, 'generated', 'generated_seqs_noerr.csv'),
                                                      separator=';',
                                                      index_col='seq_index',
                                                      cols=['nt_sequence'])
                # Rename index and column to the configured names and add the
                # amino acid translation of each nucleotide sequence.
                seqs_df.index.names = [get_config_data('COMMON', 'I_COL')]
                seqs_df.columns = [get_config_data('COMMON', 'NT_COL')]
                seqs_df[get_config_data('COMMON', 'AA_COL')] = \
                    seqs_df[get_config_data('COMMON', 'NT_COL')].apply(nucleotides_to_aminoacids)
                real_df = read_separated_to_dataframe(file=os.path.join(
                    working_dir, 'generated',
                    'generated_realizations_noerr.csv'),
                                                      separator=';',
                                                      index_col='seq_index')
                real_df.index.names = [get_config_data('COMMON', 'I_COL')]
                # NOTE(review): 'model' and 'model_type' stay unbound when
                # neither args.model nor args.custom_model is set; presumably
                # the parser options guarantee one of them - confirm.
                # For a custom model the original (not the copied) files are
                # loaded here.
                if args.model:
                    files = get_default_model_file_paths(name=args.model)
                    model_type = files['type']
                    model = IgorLoader(model_type=model_type,
                                       model_params=files['parameters'],
                                       model_marginals=files['marginals'])
                elif args.custom_model:
                    model_type = args.type
                    model = IgorLoader(model_type=model_type,
                                       model_params=args.custom_model[0],
                                       model_marginals=args.custom_model[1])
                real_df = self._process_realizations(
                    data=real_df,
                    model=model,
                    v_gene_choice_col=get_config_data('COMMON',
                                                      'V_GENE_CHOICE_COL'),
                    d_gene_choice_col=get_config_data('COMMON',
                                                      'D_GENE_CHOICE_COL'),
                    j_gene_choice_col=get_config_data('COMMON',
                                                      'J_GENE_CHOICE_COL'))
                full_seqs_df = seqs_df.merge(real_df,
                                             left_index=True,
                                             right_index=True)
            except (IOError, KeyError, ValueError) as err:
                self.logger.error(str(err))
                return

            # Write the pandas dataframe to a separated file. The file name
            # falls back to 'generated_seqs_<model type>' when no output name
            # is configured.
            try:
                self.logger.info('Writing generated sequences to file system')
                output_filename = get_config_data('COMMON', 'OUT_NAME')
                if not output_filename:
                    output_filename = 'generated_seqs_{}'.format(model_type)
                _, filename = write_dataframe_to_separated(
                    dataframe=full_seqs_df,
                    filename=output_filename,
                    directory=output_dir,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_name=get_config_data('COMMON', 'I_COL'))
                self.logger.info("Written '%s'", filename)
            except IOError as err:
                self.logger.error(str(err))
                return

        # If the given type of sequences generation is CDR3, use OLGA. (The
        # condition is always truthy here, the falsy case is handled above.)
        elif eval_cdr3:

            # Get the working directory.
            working_dir = get_config_data('COMMON', 'WORKING_DIR')

            # Load the model, create the sequence generator and generate the sequences.
            self.logger.info('Loading the IGoR model files')
            try:
                if args.model:
                    files = get_default_model_file_paths(name=args.model)
                    model_type = files['type']
                    model = IgorLoader(model_type=model_type,
                                       model_params=files['parameters'],
                                       model_marginals=files['marginals'])
                    # For a pre-installed model the anchor files come from
                    # the model's own file paths and replace any value given
                    # on the commandline; they are read as tab separated.
                    args.anchor = [['V', files['v_anchors']],
                                   ['J', files['j_anchors']]]
                    separator = '\t'
                elif args.custom_model:
                    model_type = args.type
                    model = IgorLoader(model_type=model_type,
                                       model_params=args.custom_model[0],
                                       model_marginals=args.custom_model[1])
                    separator = get_config_data('COMMON', 'SEPARATOR')
                # Each anchor file is rewritten with ',' as separator into
                # '<working_dir>/cdr3_anchors' before it is set on the model.
                # NOTE(review): args.anchor is None when no -anchor was given
                # for a custom model; the resulting TypeError is caught below.
                for gene in args.anchor:
                    anchor_file = preprocess_separated_file(
                        os.path.join(working_dir, 'cdr3_anchors'),
                        str(gene[1]), separator, ',')
                    model.set_anchor(gene=gene[0], file=anchor_file)
                model.initialize_model()
            except (TypeError, OSError, IOError, KeyError, ValueError) as err:
                self.logger.error(str(err))
                return

            # Setup the sequence generator and generate sequences.
            self.logger.info('Generating sequences')
            try:
                seq_generator = OlgaContainer(
                    igor_model=model,
                    nt_col=get_config_data('COMMON', 'NT_COL'),
                    nt_p_col=get_config_data('COMMON', 'NT_P_COL'),
                    aa_col=get_config_data('COMMON', 'AA_COL'),
                    aa_p_col=get_config_data('COMMON', 'AA_P_COL'),
                    v_gene_choice_col=get_config_data('COMMON',
                                                      'V_GENE_CHOICE_COL'),
                    j_gene_choice_col=get_config_data('COMMON',
                                                      'J_GENE_CHOICE_COL'))
                # The commandline amount overrides the configured one; only a
                # strictly positive amount is accepted.
                n_generate = get_config_data('GENERATE', 'NUM_GENERATE', 'int')
                if args.n_gen:
                    n_generate = args.n_gen
                if n_generate > 0:
                    cdr3_seqs_df = seq_generator.generate(num_seqs=n_generate)
                else:
                    self.logger.error(
                        'Number of sequences to generate should be higher 0')
                    return
            except (TypeError, IOError) as err:
                self.logger.error(str(err))
                return

            # Write the pandas dataframe to a separated file. The file name
            # falls back to 'generated_seqs_<model type>_CDR3' when no output
            # name is configured.
            try:
                self.logger.info('Writing generated sequences to file system')
                output_filename = get_config_data('COMMON', 'OUT_NAME')
                if not output_filename:
                    output_filename = 'generated_seqs_{}_CDR3'.format(
                        model_type)
                _, filename = write_dataframe_to_separated(
                    dataframe=cdr3_seqs_df,
                    filename=output_filename,
                    directory=output_dir,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_name=get_config_data('COMMON', 'I_COL'))
                self.logger.info("Written '%s'", filename)
            except IOError as err:
                self.logger.error(str(err))
                return
Esempio n. 9
0
    def run(self, args, output_dir):
        """Build an IGoR model from input sequences and reference genomes.

        The IGoR command is assembled in five logged steps, executed, and the
        files found in the 'inference' sub-directory of the working directory
        are copied to the output directory. Every handled failure is logged
        through ``self.logger`` and ends the method early.

        Parameters
        ----------
        args : Namespace
            Object containing our parsed commandline arguments.
        output_dir : str
            A directory path for writing output files to.

        """
        log = self.logger

        # Step 1: working directory and number of threads.
        log.info('Setting up initial IGoR command (1/5)')
        working_dir = get_config_data('COMMON', 'WORKING_DIR')
        num_threads = get_config_data('COMMON', 'NUM_THREADS', 'int')
        command_list = [['set_wd', working_dir], ['threads', str(num_threads)]]

        # Step 2: copy and pre-process every given reference genome file.
        log.info('Processing genomic reference templates (2/5)')
        genomic_command = ['set_genomic']
        try:
            for reference in args.ref:
                template = preprocess_reference_file(
                    os.path.join(working_dir, 'genomic_templates'),
                    copy_to_dir(working_dir, reference[1], 'fasta'), 1)
                genomic_command.append([reference[0], template])
        except IOError as err:
            log.error(str(err))
            return
        command_list.append(genomic_command)

        # Step 3: start from the parameters of a pre-installed model that
        # matches the requested model type.
        log.info('Setting initial model parameters (3/5)')
        if args.type in ('beta', 'heavy'):
            initial_model = 'human-t-beta'
        elif args.type in ('alpha', 'light'):
            initial_model = 'human-t-alpha'
        else:
            initial_model = None
        if initial_model is not None:
            initial_params = get_default_model_file_paths(
                name=initial_model)['parameters']
            command_list.append(['set_custom_model', initial_params])

        # Step 4: hand the input sequences to IGoR, either as FASTA copy or
        # as a re-written ';' separated file.
        log.info('Pre-processing input sequence file (4/5)')
        try:
            if is_fasta(args.seqs):
                log.info('FASTA input file extension detected')
                fasta_copy = copy_to_dir(working_dir, str(args.seqs), 'fasta')
                command_list.append(['read_seqs', fasta_copy])
            elif not is_separated(args.seqs,
                                  get_config_data('COMMON', 'SEPARATOR')):
                log.error(
                    'Given input sequence file could not be detected as FASTA file or separated data type')
                return
            else:
                log.info('Separated input file type detected')
                try:
                    input_dir = os.path.join(working_dir, 'input')
                    csv_copy = copy_to_dir(working_dir, str(args.seqs), 'csv')
                    separator = get_config_data('COMMON', 'SEPARATOR')
                    index_col = get_config_data('COMMON', 'I_COL')
                    nt_col = get_config_data('COMMON', 'NT_COL')
                    input_seqs = preprocess_separated_file(
                        input_dir, csv_copy, separator, ';', index_col,
                        [nt_col])
                except (KeyError, ValueError):
                    log.error(
                        "Given input sequence file does not have a '%s' column",
                        get_config_data('COMMON', 'NT_COL'))
                    return
                command_list.append(['read_seqs', input_seqs])
        except (IOError, KeyError) as err:
            log.error(str(err))
            return

        # Step 5: alignment plus inference; the configured iteration count is
        # only looked up when none is given on the commandline.
        log.info('Adding additional variables to IGoR command (5/5)')
        command_list.append(['align', ['all']])
        n_iter = args.n_iter if args.n_iter else get_config_data(
            'BUILD', 'NUM_ITERATIONS', 'int')
        command_list.append(['infer', ['N_iter', str(n_iter)]])

        # Run IGoR and stop on a non-zero exit code.
        log.info('Executing IGoR (this might take a while)')
        try:
            exit_code, _, stderr, _ = IgorInterface(
                command=command_list).call()
            if exit_code != 0:
                log.error(
                    "An error occurred during execution of IGoR command (exit code %s):\n%s",
                    exit_code, stderr)
                return
        except OSError as err:
            log.error(str(err))
            return

        # Copy the model files, followed by every file of the inference
        # directory, to the output directory using the output prefix.
        try:
            log.info('Writing model files to file system')
            output_prefix = get_config_data('COMMON', 'OUT_NAME') or 'model'
            model_files = (('final_marginals.txt', 'marginals'),
                           ('final_parms.txt', 'params'))
            for source_name, suffix in model_files:
                _, written = self._copy_file_to_output(
                    file=os.path.join(working_dir, 'inference', source_name),
                    filename='{}_{}'.format(output_prefix, suffix),
                    directory=output_dir)
                log.info("Written '%s'", written)
            for entry in os.listdir(os.path.join(working_dir, 'inference')):
                _, written = self._copy_file_to_output(
                    file=os.path.join(working_dir, 'inference', entry),
                    filename='{}_{}'.format(output_prefix,
                                            entry.partition('.')[0]),
                    directory=output_dir)
                log.info("Written '%s'", written)
        except IOError as err:
            log.error(str(err))
            return
Esempio n. 10
0
    def _add_options(self):
        """Register the 'convert' sub-command together with all of its options.

        Notes
        -----
            The sub-command parser is created on ``self.subparsers`` and is
            filled through ``dynamic_cli_options``; nothing is returned.

        """
        # The configured defaults are shown in the help texts below.
        n_random_default = get_config_data('CONVERT', 'NUM_RANDOM', 'int')
        use_allele_default = get_config_data('CONVERT', 'USE_ALLELE', 'bool')

        description = (
            "Converts the full length (VDJ for productive, unproductive and the total) and CDR3 sequences from a "
            "given adaptive input sequence file. The VDJ sequences can be used to build a new IGoR model and the CDR3 "
            "sequences can be evaluated."
        )

        # Input files first, tuning options last.
        seqs_option = {
            'metavar': '<separated>',
            'required': 'True',
            'type': 'str',
            'help': "An input separated data file with sequences to convert using the defined column names.",
        }
        ref_option = {
            'metavar': ('<gene>', '<fasta>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': 'True',
            'help': (
                "A gene (V or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                "needs to conform to IGMT annotation (separated by '|' character)."
            ),
        }
        n_random_option = {
            'type': 'int',
            'nargs': '?',
            'help': (
                "Number of random sequences (subset) to convert from the given file (default: {})."
            ).format(n_random_default),
        }
        use_allele_option = {
            'action': 'store_true',
            'help': (
                "If specified (True), the allele information from the resolved gene fields are used to when "
                "reconstructing the gene choices (default: {})."
            ).format(use_allele_default),
        }
        parser_options = {
            '-seqs': seqs_option,
            '-ref': ref_option,
            '-n-random': n_random_option,
            '-use-allele': use_allele_option,
        }

        # Create the sub-command parser and attach the collected options.
        tool_parser = self.subparsers.add_parser(
            'convert', help=description, description=description)
        dynamic_cli_options(parser=tool_parser, options=parser_options)
Esempio n. 11
0
    def run(self, args, output_dir):
        """Convert an adaptive sequence file into CDR3 and full length files.

        The V and J reference genome files given through ``args.ref`` are
        pre-processed, the sequence file in ``args.seqs`` is read and handed to
        ``AdaptiveSequenceConvertor.convert``. The four returned dataframes
        (CDR3, productive, unproductive and all full length sequences) get a
        leading column with the input file name and are written to the output
        directory.

        Every handled failure is logged through ``self.logger`` and ends the
        method early; the method has no return value.

        Parameters
        ----------
        args : Namespace
            Object containing our parsed commandline arguments.
        output_dir : str
            A directory path for writing output files to.

        """
        # Get the working directory.
        working_dir = get_config_data('COMMON', 'WORKING_DIR')

        # Collect and read in the corresponding reference genomic templates.
        # Only V and J references are kept; other genes are pre-processed but
        # not used further.
        self.logger.info('Processing genomic reference templates')
        resolved_col_keys = {'V': 'V_RESOLVED_COL', 'J': 'J_RESOLVED_COL'}
        ref_gene_dfs = {}
        try:
            for gene in args.ref:
                filename = preprocess_reference_file(
                    os.path.join(working_dir, 'genomic_templates'),
                    copy_to_dir(working_dir, gene[1], 'fasta'),
                )
                if gene[0] in resolved_col_keys:
                    ref_gene_dfs[gene[0]] = self._process_gene_df(
                        filename=filename,
                        nt_col=get_config_data('COMMON', 'NT_COL'),
                        resolved_col=get_config_data(
                            'COMMON', resolved_col_keys[gene[0]]))
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Both references are needed by the conversion; without this check a
        # missing one would surface later as an uncaught NameError.
        missing_genes = [name for name in ('V', 'J')
                         if name not in ref_gene_dfs]
        if missing_genes:
            self.logger.error(
                "No reference genome file given for gene(s): %s",
                ', '.join(missing_genes))
            return

        # Read in the sequence data.
        self.logger.info('Pre-processing input sequence file')
        try:
            seqs_df = read_separated_to_dataframe(
                file=args.seqs,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                cols=[
                    get_config_data('COMMON', 'NT_COL'),
                    get_config_data('COMMON', 'AA_COL'),
                    get_config_data('COMMON', 'FRAME_TYPE_COL'),
                    get_config_data('COMMON', 'CDR3_LENGTH_COL'),
                    get_config_data('COMMON', 'V_RESOLVED_COL'),
                    get_config_data('COMMON', 'J_RESOLVED_COL')
                ])

            # Take a random subsample of sequences in the file; the
            # commandline value overrides the configured one.
            n_random = get_config_data('CONVERT', 'NUM_RANDOM', 'int')
            if args.n_random:
                n_random = args.n_random
            if n_random != 0 and len(seqs_df) < n_random:
                self.logger.warning(
                    'Number of random sequences is higher then number of '
                    'rows in file, all rows are used')
                # Continue with the complete file instead of aborting, as the
                # warning announces. 0 is the value that skips the size check
                # above; presumably convert() also treats it as "no
                # sub-sampling" - TODO confirm.
                n_random = 0
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Setup the data convertor class and convert data.
        self.logger.info('Converting adaptive file format')
        try:
            use_allele = get_config_data('CONVERT', 'USE_ALLELE', 'bool')
            if args.use_allele:
                use_allele = args.use_allele
            asc = AdaptiveSequenceConvertor()
            cdr3_df, full_prod_df, full_unprod_df, full_df = asc.convert(
                num_threads=get_config_data('COMMON', 'NUM_THREADS', 'int'),
                seqs=seqs_df,
                ref_v_genes=ref_gene_dfs['V'],
                ref_j_genes=ref_gene_dfs['J'],
                row_id_col=get_config_data('COMMON', 'ROW_ID_COL'),
                nt_col=get_config_data('COMMON', 'NT_COL'),
                aa_col=get_config_data('COMMON', 'AA_COL'),
                frame_type_col=get_config_data('COMMON', 'FRAME_TYPE_COL'),
                cdr3_length_col=get_config_data('COMMON', 'CDR3_LENGTH_COL'),
                v_resolved_col=get_config_data('COMMON', 'V_RESOLVED_COL'),
                v_gene_choice_col=get_config_data('COMMON',
                                                  'V_GENE_CHOICE_COL'),
                j_resolved_col=get_config_data('COMMON', 'J_RESOLVED_COL'),
                j_gene_choice_col=get_config_data('COMMON',
                                                  'J_GENE_CHOICE_COL'),
                use_allele=use_allele,
                default_allele=get_config_data('CONVERT', 'DEFAULT_ALLELE'),
                n_random=n_random)

            # Prepend the input file name (without directory and extension)
            # as first column of every converted dataframe.
            file_name_id_col = get_config_data('COMMON', 'FILE_NAME_ID_COL')
            file_name_id = os.path.splitext(os.path.basename(args.seqs))[0]
            for converted_df in (cdr3_df, full_prod_df, full_unprod_df,
                                 full_df):
                converted_df.insert(0, file_name_id_col, file_name_id)
        except KeyError as err:
            self.logger.error(str(err))
            return

        # Write the converted dataframes to the output directory with prefix.
        try:
            self.logger.info('Writing converted files to file system')
            output_prefix = get_config_data('COMMON', 'OUT_NAME')
            if not output_prefix:
                output_prefix = 'converted'
            outputs = (
                ('{}_CDR3', cdr3_df),
                ('{}_full_length_productive', full_prod_df),
                ('{}_full_length_unproductive', full_unprod_df),
                ('{}_full_length', full_df),
            )
            for name_template, converted_df in outputs:
                _, filename = write_dataframe_to_separated(
                    dataframe=converted_df,
                    filename=name_template.format(output_prefix),
                    directory=output_dir,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_name=get_config_data('COMMON', 'I_COL'))
                self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return