def __init__(self, input_file_paths, genes_table_structure):
    """Set up the 'MyRastGUI' parser for its three expected table files.

    `genes_table_structure` is stored on the instance unchanged. The expected
    file names and the per-file column layouts are passed on to
    `Parser.__init__` together with `input_file_paths`.
    """
    self.genes_table_structure = genes_table_structure

    expected = dict(functions='functions.tbl',
                    gene_otus='gene_otus.tbl',
                    peg='peg.tbl')

    functions_layout = {
        'col_names': ['prot', 'figfam', 'field3', 'field4', 'field5', 'function'],
        'col_mapping': [str, str, int, int, int, str],
    }
    # no per-column converters are declared for this file
    gene_otus_layout = {
        'col_names': ['prot', 't_species'],
        'col_mapping': None,
    }
    peg_layout = {
        'col_names': ['prot', 'contig', 'start', 'stop'],
        'col_mapping': [str, str, int, int],
    }
    structure = {
        'functions': functions_layout,
        'gene_otus': gene_otus_layout,
        'peg': peg_layout,
    }

    Parser.__init__(self, 'MyRastGUI', input_file_paths, expected, structure)
def __init__(self, input_files, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress(), skip_fix_input=False):
    """Set up the 'Kaiju' parser for a single tab-separated output file.

    Parameters
    ----------
    input_files : str or list
        A path, or a list whose first element is the path to parse.
    taxonomy_table_structure
        Accepted for signature compatibility; not used in this constructor.
    run, progress
        Terminal helpers stored on the instance.
    skip_fix_input : bool
        When False (default), the first input is passed through
        `self.fix_input_file` and the file it returns is parsed instead and
        removed afterwards.
    """
    self.run = run
    self.progress = progress
    self.just_do_it = False

    # Work on our own list: the first element may be swapped for a generated
    # file below, and the caller's list must not end up pointing at a path
    # that we delete.
    if isinstance(input_files, list):
        input_files = list(input_files)
    else:
        input_files = [input_files]

    fixed_input_path = None
    if not skip_fix_input:
        fixed_input_path = self.fix_input_file(input_files[0])
        input_files[0] = fixed_input_path

    files_expected = {'kaiju_output': input_files[0]}

    files_structure = {
        'kaiju_output': {
            'col_names': ['_', 'gene_callers_id', '_', '_', '_', '_', '_', 'taxonomy'],
            'col_mapping': [str, int, str, int, str, str, str, str],
            'separator': '\t',
            'indexing_field': -1,
        },
    }

    try:
        Parser.__init__(self, 'Kaiju', input_files, files_expected, files_structure)
    finally:
        # Remove the generated file even when parsing fails, so it does not
        # linger on disk.
        if fixed_input_path is not None:
            os.remove(fixed_input_path)
def __init__(self, input_file_paths, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'agnostos' parser for a single tab-separated output file.

    Parameters
    ----------
    input_file_paths : list
        Only the first element is used; it is passed through
        `self.fix_input_file` and the returned path is what gets parsed.
    run, progress
        Terminal helpers stored on the instance. `progress` is opened around
        the call to `Parser.__init__`.
    """
    self.run = run
    self.progress = progress
    self.just_do_it = False

    input_file_path = self.fix_input_file(input_file_paths[0])

    files_expected = {'agnostos_output': input_file_path}

    files_structure = {
        'agnostos_output': {
            'col_names': [
                'gene_callers_id', 'cl_name', 'contig', 'gene_x_contig',
                'cl_size', 'category', 'pfam', 'is.HQ', 'is.LS',
                'lowest_rank', 'lowest_level', 'niche_breadth_sign'
            ],
            'col_mapping': [int, str, str, str, str, str, str, str, str, str, str, str],
            'indexing_field': -1,
            'separator': '\t',
        },
    }

    self.progress.new('Initializing the parser')
    self.progress.update('...')
    try:
        Parser.__init__(self, 'agnostos', [input_file_path], files_expected, files_structure)
    finally:
        # Always close the progress display, including when parsing raises.
        self.progress.end()
def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', run=terminal.Run()):
    """Set up the parser for a HMMER table after repairing its description column.

    The column layout is chosen from the `context` / `alphabet` pair. A
    `ConfigError` is raised for unsupported pairs, and for the 'DOMAIN'
    context when `program` is not 'hmmsearch'.
    """
    self.alphabet = alphabet
    self.context = context
    self.program = program
    self.run = run

    if self.context == "GENE":
        col_names, col_mapping = self.get_col_info_for_GENE_context()
    elif self.context == "CONTIG" and self.alphabet in ("DNA", "RNA"):
        col_names, col_mapping = self.get_col_info_for_CONTIG_context()
    elif self.context == "DOMAIN" and self.alphabet == "AA":
        if program != 'hmmsearch':
            raise ConfigError("HMMScan :: the 'DOMAIN' context is only available for hmmsearch.")
        col_names, col_mapping = self.get_col_info_for_DOMAIN_context()
    else:
        raise ConfigError("HMMScan driver is confused. Yor context and alphabet pair ('%s' and '%s') "
                          "does not seem to be implemented in the parser module. If you think this is "
                          "not a mistake on your part, please get in touch with the anvi'o developers "
                          "and watch them fix it like actual pros." % (self.context, self.alphabet))

    # HMMER writes space-delimited, column-aligned output whose description
    # field can itself contain spaces. Once anvio.driver.HMMER has turned
    # spaces into tabs, rows can therefore have a varying number of columns,
    # so the split description pieces must be merged back into one column
    # before the base parser sees the file. Tab-delimited output was requested
    # upstream (https://github.com/EddyRivasLab/hmmer/issues/235) and is
    # expected with HMMER4; until then this detour stays. The base parser is
    # still used afterwards so that switching back later is trivial.
    repaired_table_path = self.fix_sad_hmmer_table_output(hmmer_table_txt, col_names)

    hits_layout = {
        'col_names': col_names,
        'col_mapping': col_mapping,
        'indexing_field': -1,
        'no_header': True,
    }

    Parser.__init__(self,
                    self.program,
                    [repaired_table_path],
                    {'hits': repaired_table_path},
                    {'hits': hits_layout})
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'DefaultMatrix' parser for a gene-level taxonomy matrix.

    Only the first element of `input_file_paths` is used.
    `taxonomy_table_structure`, `run` and `progress` are stored on the
    instance unchanged.
    """
    self.run = run
    self.progress = progress

    matrix_path = input_file_paths[0]

    matrix_layout = {
        'col_names': ['gene_callers_id', 't_phylum', 't_class', 't_order',
                      't_family', 't_genus', 't_species'],
        'col_mapping': [int, str, str, str, str, str, str],
        'only_expected_fields': True,
    }

    self.taxonomy_table_structure = taxonomy_table_structure

    Parser.__init__(self,
                    'DefaultMatrix',
                    [matrix_path],
                    {'matrix': matrix_path},
                    {'matrix': matrix_layout})
def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE'):
    """Set up the 'HMMScan' parser for a headerless hits table.

    Parameters
    ----------
    hmm_scan_hits_txt : str
        Path to the hits table.
    alphabet : str
        Only consulted for the "CONTIG" context, where it must be "DNA" or "RNA".
    context : str
        "GENE" or "CONTIG"; selects the column layout.

    Raises
    ------
    ConfigError
        If the context / alphabet pair has no column layout defined here.
    """
    self.alphabet = alphabet
    self.context = context

    files_expected = {'hits': hmm_scan_hits_txt}

    if self.context == "GENE":
        # see the HMMER user guide for details of the fields for AA sequence search, and DNA sequence search.
        col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
        col_mapping = [str, str, int, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str]
    elif self.context == "CONTIG" and self.alphabet in ("DNA", "RNA"):
        # 'hmm_target', 'hmm_acc', 'query_id', 'query_acc', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'seq_len', 'strand', 'e_value', 'score', 'bias', 'desc']
        col_names = ['gene_name', 'gene_hmm_id', 'contig_name', 'f', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'f', 'f', 'e_value', 'f', 'f', 'f']
        col_mapping = [str, str, str, str, str, str, int, int, int, int, str, str, float, str, str, str]
    else:
        # Adjacent string literals are used instead of backslash continuations
        # inside one literal, which leaked stray backslashes/whitespace into
        # the message shown to the user.
        raise ConfigError(
            "HMMScan driver is confused. Your context and alphabet pair ('%s' and '%s') "
            "does not seem to be implemented in the parser module. If you think this is "
            "not a mistake on your part, please get in touch with the anvi'o developers "
            "and watch them fix it like actual pros." % (self.context, self.alphabet))

    files_structure = {
        'hits': {
            'col_names': col_names,
            'col_mapping': col_mapping,
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], files_expected, files_structure)
def __init__(self, input_file_paths, splits_taxonomy_table_structure):
    """Set up the 'MyRastCMDLine' parser for the single svr output file.

    `splits_taxonomy_table_structure` is stored on the instance unchanged;
    `input_file_paths` is forwarded to `Parser.__init__` as given.
    """
    self.splits_taxonomy_table_structure = splits_taxonomy_table_structure

    expected = {'svr_output': 'svr_assign_to_dna_using_figfams.txt'}

    svr_layout = {
        'col_names': ['contig', 'field1', 'prot', 'function', 't_species'],
        'col_mapping': [str, int, str, str, str],
        'indexing_field': 2,
    }

    Parser.__init__(self, 'MyRastCMDLine', input_file_paths, expected, {'svr_output': svr_layout})
def __init__(self, input_file_paths, splits_taxonomy_table_structure):
    """Set up the 'DefaultMatrix' parser for a combined gene/taxonomy matrix.

    Only the first element of `input_file_paths` is used.
    `splits_taxonomy_table_structure` is stored on the instance unchanged.
    """
    matrix_path = input_file_paths[0]

    column_names = ['prot', 'contig', 'start', 'stop', 'direction', 'figfam', 'function',
                    't_phylum', 't_class', 't_order', 't_family', 't_genus', 't_species']
    column_types = [str, str, int, int, str, str, str,
                    str, str, str, str, str, str]

    expected = {'matrix': matrix_path}
    structure = {'matrix': {'col_names': column_names, 'col_mapping': column_types}}

    self.splits_taxonomy_table_structure = splits_taxonomy_table_structure

    Parser.__init__(self, 'DefaultMatrix', [matrix_path], expected, structure)
def __init__(self, input_file_paths, genes_table_structure):
    """Set up the 'MyRastCMDLine' parser for a functions table plus a FASTA file.

    `genes_table_structure` is stored on the instance unchanged;
    `input_file_paths` is forwarded to `Parser.__init__` as given.
    """
    self.genes_table_structure = genes_table_structure

    expected = {
        'functions': 'svr_assign_using_figfams.txt',
        'genes': 'svr_call_pegs.txt',
    }

    functions_layout = {
        'col_names': ['t_species', 'field2', 'prot', 'function'],
        'col_mapping': [str, int, str, str],
        'indexing_field': 2,
    }
    structure = {
        'functions': functions_layout,
        # declared by type only; no column layout is given for this file
        'genes': {'type': 'fasta'},
    }

    Parser.__init__(self, 'MyRastCMDLine', input_file_paths, expected, structure)
def __init__(self, input_file_paths, genes_table_structure):
    """Set up the 'DefaultMatrix' parser for a combined gene/taxonomy matrix.

    Only the first element of `input_file_paths` is used.
    `genes_table_structure` is stored on the instance unchanged.
    """
    matrix_path = input_file_paths[0]

    matrix_layout = {
        'col_names': ['prot', 'contig', 'start', 'stop', 'direction', 'figfam', 'function',
                      't_phylum', 't_class', 't_order', 't_family', 't_genus', 't_species'],
        'col_mapping': [str, str, int, int, str, str, str,
                        str, str, str, str, str, str],
    }

    self.genes_table_structure = genes_table_structure

    Parser.__init__(self,
                    'DefaultMatrix',
                    [matrix_path],
                    {'matrix': matrix_path},
                    {'matrix': matrix_layout})
def __init__(self, hmm_scan_hits_txt):
    """Set up the 'HMMScan' parser for one headerless hits table.

    Columns named 'f' are placeholders for fields this layout does not name.
    """
    hits_layout = {
        'col_names': ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f','f', 'f','f'],
        'col_mapping': [str, str, int, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str],
        'indexing_field': -1,
        'no_header': True,
    }

    expected = {'hits': hmm_scan_hits_txt}
    structure = {'hits': hits_layout}

    Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], expected, structure)
def __init__(self, input_file_paths, taxonomy_table_structure):
    """Set up the 'MyRastCMDLine' parser for a functions table plus a FASTA file.

    `taxonomy_table_structure` is stored on the instance unchanged;
    `input_file_paths` is forwarded to `Parser.__init__` as given.
    """
    self.taxonomy_table_structure = taxonomy_table_structure

    expected = dict(functions='svr_assign_using_figfams.txt',
                    genes='svr_call_pegs.txt')

    structure = {
        'functions': {
            'col_names': ['t_species', 'field2', 'prot', 'function'],
            'col_mapping': [str, int, str, str],
            'indexing_field': 2,
        },
        # declared by type only; no column layout is given for this file
        'genes': {'type': 'fasta'},
    }

    Parser.__init__(self, 'MyRastCMDLine', input_file_paths, expected, structure)
def __init__(self, input_file_paths):
    """Set up the 'InterProScan' parser for one headerless matrix file.

    The first element of `input_file_paths` is registered as the expected
    'matrix' file, while the full list is what gets forwarded to
    `Parser.__init__`.
    """
    first_path = input_file_paths[0]

    column_names = ['gene_callers_id', 'hash', 'length', 'source', 'accession', 'function',
                    'start', 'stop', 'e_value', 'status', 'date']
    column_types = [int, str, int, str, str, str,
                    int, int, str, str, str]

    expected = {'matrix': first_path}
    structure = {
        'matrix': {
            'col_names': column_names,
            'col_mapping': column_types,
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'InterProScan', input_file_paths, expected, structure)
def __init__(self, hmm_scan_hits_txt):
    """Set up the 'HMMScan' parser for one headerless hits table.

    Columns named 'f' are placeholders for fields this layout does not name.
    """
    column_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
    column_types = [str, str, int, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str]

    structure = {
        'hits': {
            'col_names': column_names,
            'col_mapping': column_types,
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], {'hits': hmm_scan_hits_txt}, structure)
def __init__(self, proteins_in_contigs_fasta, hmm_scan_hits_txt):
    """Set up the 'HMMScan' parser for a proteins FASTA and a headerless hits table.

    Columns named 'f' are placeholders for fields this layout does not name.
    """
    expected = {
        'proteins': proteins_in_contigs_fasta,
        'hits': hmm_scan_hits_txt,
    }

    hits_layout = {
        'col_names': ['gene_name', 'gene_id', 'query_name', 'f', 'e_value', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f','f', 'f','f'],
        'col_mapping': [str, str, str, str, float, str, str, str, str, str, str, str, str, str, str, str, str, str],
        'indexing_field': -1,
        'no_header': True,
    }
    structure = {
        'hits': hits_layout,
        'proteins': {'type': 'fasta'},
    }

    paths = [proteins_in_contigs_fasta, hmm_scan_hits_txt]
    Parser.__init__(self, 'HMMScan', paths, expected, structure)
def __init__(self, input_files, contigs='False'):
    """Set up the 'CONCOCT' parser for a comma-separated, headerless clusters file.

    Parameters
    ----------
    input_files : str or list
        A path, or a list whose first element is the clusters file.
    contigs
        Not referenced in this constructor.
        NOTE(review): the default is the *string* 'False', which is truthy --
        confirm the intent before relying on it.
    """
    # isinstance also recognises list subclasses, which the previous exact
    # type comparison would have wrapped in another list.
    if not isinstance(input_files, list):
        input_files = [input_files]

    files_expected = {'clusters': input_files[0]}

    files_structure = {
        'clusters': {
            'col_names': ['split', 'cluster_id'],
            'col_mapping': [str, str],
            'separator': ',',
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'CONCOCT', input_files, files_expected, files_structure)
def __init__(self, input_file_paths, genes_table_structure):
    """Set up the 'MyRastGUI' parser for its three expected table files.

    `genes_table_structure` is stored on the instance unchanged;
    `input_file_paths` is forwarded to `Parser.__init__` as given.
    """
    self.genes_table_structure = genes_table_structure

    expected = {
        'functions': 'functions.tbl',
        'gene_otus': 'gene_otus.tbl',
        'peg': 'peg.tbl',
    }

    structure = {
        'functions': {
            'col_names': ['prot', 'figfam', 'field3', 'field4', 'field5', 'function'],
            'col_mapping': [str, str, int, int, int, str],
        },
        'gene_otus': {
            'col_names': ['prot', 't_species'],
            # no per-column converters are declared for this file
            'col_mapping': None,
        },
        'peg': {
            'col_names': ['prot', 'contig', 'start', 'stop'],
            'col_mapping': [str, str, int, int],
        },
    }

    Parser.__init__(self, 'MyRastGUI', input_file_paths, expected, structure)
def __init__(self, input_file_paths, splits_taxonomy_table_structure):
    """Set up the 'MyRastCMDLine' parser for the single svr output file.

    `splits_taxonomy_table_structure` is stored on the instance unchanged;
    `input_file_paths` is forwarded to `Parser.__init__` as given.
    """
    self.splits_taxonomy_table_structure = splits_taxonomy_table_structure

    column_names = ['contig', 'field1', 'prot', 'function', 't_species']
    column_types = [str, int, str, str, str]

    expected = dict(svr_output='svr_assign_to_dna_using_figfams.txt')
    structure = {
        'svr_output': {
            'col_names': column_names,
            'col_mapping': column_types,
            'indexing_field': 2,
        },
    }

    Parser.__init__(self, 'MyRastCMDLine', input_file_paths, expected, structure)
def __init__(self, input_files, contigs='False'):
    """Set up the 'CONCOCT' parser for a comma-separated, headerless clusters file.

    Parameters
    ----------
    input_files : str or list
        A path, or a list whose first element is the clusters file.
    contigs
        Not referenced in this constructor.
        NOTE(review): the default is the *string* 'False', which is truthy --
        confirm the intent before relying on it.
    """
    # isinstance also recognises list subclasses, which the previous exact
    # type comparison would have wrapped in another list.
    if not isinstance(input_files, list):
        input_files = [input_files]

    files_expected = {'clusters': input_files[0]}

    files_structure = {
        'clusters': {
            'col_names': ['split', 'bin_name'],
            'col_mapping': [str, str],
            'separator': ',',
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'CONCOCT', input_files, files_expected, files_structure)
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'DefaultMatrix' parser for a gene-level taxonomy matrix.

    The column layout is 'gene_callers_id' followed by every entry of the
    module-level `levels_of_taxonomy`, each mapped with `str`. Only the first
    element of `input_file_paths` is used.
    """
    self.run = run
    self.progress = progress

    matrix_path = input_file_paths[0]

    column_names = ['gene_callers_id'] + levels_of_taxonomy
    column_types = [int] + [str] * len(levels_of_taxonomy)

    expected = {'matrix': matrix_path}
    structure = {
        'matrix': {
            'col_names': column_names,
            'col_mapping': column_types,
            'only_expected_fields': True,
        },
    }

    self.taxonomy_table_structure = taxonomy_table_structure

    Parser.__init__(self, 'DefaultMatrix', [matrix_path], expected, structure)
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'DefaultMatrix' parser for a gene-level taxonomy matrix.

    Only the first element of `input_file_paths` is used.
    `taxonomy_table_structure`, `run` and `progress` are stored on the
    instance unchanged.
    """
    self.run = run
    self.progress = progress

    matrix_path = input_file_paths[0]

    column_names = ['gene_callers_id', 't_phylum', 't_class', 't_order',
                    't_family', 't_genus', 't_species']
    column_types = [int, str, str, str, str, str, str]

    expected = {'matrix': matrix_path}
    structure = {
        'matrix': {
            'col_names': column_names,
            'col_mapping': column_types,
            'only_expected_fields': True,
        },
    }

    self.taxonomy_table_structure = taxonomy_table_structure

    Parser.__init__(self, 'DefaultMatrix', [matrix_path], expected, structure)
def __init__(self, hmmer_table_txt, alphabet='AA', context='GENE', program='hmmscan', run=terminal.Run()):
    """Set up the 'HMMScan' parser for a headerless HMMER hits table.

    The column layout is chosen from the `context` / `alphabet` pair. A
    `ConfigError` is raised for unsupported pairs, and for the 'DOMAIN'
    context when `program` is not 'hmmsearch'.
    """
    self.alphabet = alphabet
    self.context = context
    self.program = program
    self.run = run

    expected = {'hits': hmmer_table_txt}

    if self.context == "GENE":
        col_names, col_mapping = self.get_col_info_for_GENE_context()
    elif self.context == "CONTIG" and self.alphabet in ("DNA", "RNA"):
        col_names, col_mapping = self.get_col_info_for_CONTIG_context()
    elif self.context == "DOMAIN" and self.alphabet == "AA":
        if program != 'hmmsearch':
            raise ConfigError("HMMScan :: the 'DOMAIN' context is only available for hmmsearch.")
        col_names, col_mapping = self.get_col_info_for_DOMAIN_context()
    else:
        raise ConfigError("HMMScan driver is confused. Yor context and alphabet pair ('%s' and '%s') "
                          "does not seem to be implemented in the parser module. If you think this is "
                          "not a mistake on your part, please get in touch with the anvi'o developers "
                          "and watch them fix it like actual pros." % (self.context, self.alphabet))

    hits_layout = {
        'col_names': col_names,
        'col_mapping': col_mapping,
        'indexing_field': -1,
        'no_header': True,
    }

    Parser.__init__(self, 'HMMScan', [hmmer_table_txt], expected, {'hits': hits_layout})
def __init__(self, input_files, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'KrakenHLL' parser for a single tab-separated output file.

    Parameters
    ----------
    input_files : str or list
        A path, or a list whose first element is the file to parse.
    taxonomy_table_structure
        Accepted for signature compatibility; not used in this constructor.
    run, progress
        Terminal helpers stored on the instance.
    """
    self.run = run
    self.progress = progress

    # isinstance also recognises list subclasses, which the previous exact
    # type comparison would have wrapped in another list.
    if not isinstance(input_files, list):
        input_files = [input_files]

    files_expected = {'kraken_output': input_files[0]}

    files_structure = {
        'kraken_output': {
            'col_names': ['taxonomy', 'count'],
            'col_mapping': [str, int],
            'separator': '\t',
            'indexing_field': -1,
        },
    }

    Parser.__init__(self, 'KrakenHLL', input_files, files_expected, files_structure)
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'centrifuge' parser for a report file and a hits file.

    The first and second elements of `input_file_paths` are used as the
    report and hits files; when either is missing, the default file names
    'centrifuge_report.tsv' / 'centrifuge_hits.tsv' are expected instead.
    """
    self.run = run
    self.progress = progress
    self.min_hit_score = 250

    n_paths = len(input_file_paths)
    report_path = input_file_paths[0] if n_paths > 0 else 'centrifuge_report.tsv'
    hits_path = input_file_paths[1] if n_paths > 1 else 'centrifuge_hits.tsv'
    expected = {'report': report_path, 'hits': hits_path}

    report_layout = {
        'col_names': ['t_species', 'taxon_id', 'f1', 'f2', 'f3', 'f4', 'f5'],
        'col_mapping': [str, int, str, str, str, str, str],
        'indexing_field': 1,
    }
    hits_layout = {
        'col_names': ['gene_callers_id', 'f1', 'taxon_id', 'score', 'f2', 'f3', 'f4', 'f5'],
        # the first column keeps only the integer that precedes the first '|'
        'col_mapping': [lambda x: int(x.split('|')[0]), str, int, int, str, str, str, str],
        'indexing_field': -1,
    }
    structure = {'report': report_layout, 'hits': hits_layout}

    self.taxonomy_table_structure = taxonomy_table_structure

    Parser.__init__(self, 'centrifuge', input_file_paths, expected, structure)
def __init__(self, hmm_scan_hits_txt, alphabet='AA', context='GENE', program='hmmscan', run=None):
    """Set up the 'HMMScan' parser for a headerless hits table.

    Parameters
    ----------
    hmm_scan_hits_txt : str
        Path to the hits table.
    alphabet : str
        Only consulted for the "CONTIG" context, where it must be "DNA" or "RNA".
    context : str
        "GENE" or "CONTIG"; selects the column layout.
    program : str
        'hmmscan' or 'hmmsearch'; only consulted for the "GENE" context, where
        the two programs place the gene and model columns in a different order.
    run
        Optional terminal helper stored as `self.run`; a fresh
        `terminal.Run()` is created when omitted.

    Raises
    ------
    ConfigError
        If `program` is unknown in the "GENE" context, or if the
        context / alphabet pair has no column layout defined here.
    """
    self.alphabet = alphabet
    self.context = context
    self.program = program

    # `run` used to be read here without being a parameter of this method, so
    # it only resolved if a module-level `run` happened to exist. It is now an
    # optional keyword, resolved lazily to avoid a shared default instance.
    self.run = run if run is not None else terminal.Run()

    files_expected = {'hits': hmm_scan_hits_txt}

    if self.context == "GENE":
        if self.program == 'hmmscan':
            # see the HMMER user guide for details of the fields for AA sequence search, and DNA sequence search.
            #                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
            # target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description
            col_names = ['gene_name', 'gene_hmm_id', 'gene_callers_id', 'f', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
            col_mapping = [str, str, int, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str]
        elif self.program == 'hmmsearch':
            #                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
            # target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
            col_names = ['gene_callers_id', 'f', 'gene_name', 'gene_hmm_id', 'e_value', 'bit_score', 'f', 'f', 'dom_bit_score', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
            col_mapping = [int, str, str, str, float, float, str, str, float, str, str, str, str, str, str, str, str, str]
        else:
            raise ConfigError("The HMMScan Parser class is not sure if you know what you are doing. You told it that you wanted to "
                              "parse HMM hits from the program %s, but this class doesn't know how to handle those." % (self.program))
    elif self.context == "CONTIG" and self.alphabet in ("DNA", "RNA"):
        # 'hmm_target', 'hmm_acc', 'query_id', 'query_acc', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'seq_len', 'strand', 'e_value', 'score', 'bias', 'desc']
        col_names = ['gene_name', 'gene_hmm_id', 'contig_name', 'f', 'hmm_from', 'hmm_to', 'alignment_from', 'alignment_to', 'envelope_from', 'envelope_to', 'f', 'f', 'e_value', 'f', 'f', 'f']
        col_mapping = [str, str, str, str, str, str, int, int, int, int, str, str, float, str, str, str]
    else:
        raise ConfigError("HMMScan driver is confused. Your context and alphabet pair ('%s' and '%s') "
                          "does not seem to be implemented in the parser module. If you think this is "
                          "not a mistake on your part, please get in touch with the anvi'o developers "
                          "and watch them fix it like actual pros." % (self.context, self.alphabet))

    files_structure = {
        'hits': {
            'col_names': col_names,
            'col_mapping': col_mapping,
            'indexing_field': -1,
            'no_header': True,
        },
    }

    Parser.__init__(self, 'HMMScan', [hmm_scan_hits_txt], files_expected, files_structure)
def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the 'centrifuge' parser for its report and hits files.

    The expected file names are fixed ('centrifuge_report.tsv' and
    'centrifuge_hits.tsv'); `input_file_paths` is forwarded to
    `Parser.__init__` as given.
    """
    self.run = run
    self.progress = progress
    self.min_hit_score = 250
    self.taxonomy_table_structure = taxonomy_table_structure

    expected = dict(report='centrifuge_report.tsv',
                    hits='centrifuge_hits.tsv')

    report_layout = {
        'col_names': ['t_species', 'taxon_id', 'f1', 'f2', 'f3', 'f4', 'f5'],
        'col_mapping': [str, int, str, str, str, str, str],
        'indexing_field': 1,
    }
    hits_layout = {
        'col_names': ['gene_callers_id', 'f1', 'taxon_id', 'score', 'f2', 'f3', 'f4', 'f5'],
        # the first column keeps only the integer that precedes the first '|'
        'col_mapping': [lambda x: int(x.split('|')[0]), str, int, int, str, str, str, str],
        'indexing_field': -1,
    }

    Parser.__init__(self,
                    'centrifuge',
                    input_file_paths,
                    expected,
                    {'report': report_layout, 'hits': hits_layout})