def make_halo(search_distances, scan_distances, ratios=None):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'
    nw_factories = []
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        logging.warn("no STRING file specified !")
    if ratios is not None:
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))
    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
def make_halo(ratio_matrix, search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'
    nw_factories = []
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory2('hal', stringfile, 0.5,
                                          normalized=True))
    else:
        logging.warn("no STRING file specified !")
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))
    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories)
    return org_factory.create('hal', search_distances, scan_distances)
def make_halo(search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline()
    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, [])
    return org_factory.create('hal', search_distances, scan_distances)
def make_halo(search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, [])
    return org_factory.create('hal', search_distances, scan_distances)
def __get_kegg_data(self):
    # determine the NCBI code
    organism_code = self['organism_code']
    if os.path.exists(USER_KEGG_FILE_PATH):
        keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
    elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
        keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
    else:
        raise Exception('KEGG file not found !!')
    kegg_map = util.make_dfile_map(keggfile, 1, 3)
    kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
    if self['ncbi_code'] is None and organism_code in kegg2ncbi:
        self['ncbi_code'] = kegg2ncbi[organism_code]
    return self['ncbi_code'], kegg_map[organism_code]
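# For readers unfamiliar with the dfile helpers: util.make_dfile_map(dfile,
# key_col, value_col) evidently builds a dict from two columns of a delimited
# file. A minimal sketch under that assumption, with made-up KEGG rows; only
# the column layout (organism code in column 1, NCBI taxonomy id in column 2,
# organism name in column 3) is inferred from the call sites above.
kegg_lines = [['T00058', 'hal', '64091', 'Halobacterium sp. NRC-1']]

def make_dfile_map(lines, key_col, value_col):
    """stand-in for util.make_dfile_map, applied to a dfile's .lines"""
    return dict((line[key_col], line[value_col]) for line in lines)

kegg_map = make_dfile_map(kegg_lines, 1, 3)   # code -> organism name
kegg2ncbi = make_dfile_map(kegg_lines, 1, 2)  # code -> NCBI taxonomy id
assert kegg_map['hal'] == 'Halobacterium sp. NRC-1'
assert kegg2ncbi['hal'] == '64091'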
def __make_organism(self):
    """makes a mock organism with almost real data"""
    features = {}
    dfile = util.read_dfile('testdata/Halobacterium_sp_features',
                            comment='--')
    for line in dfile.lines:
        features[line[0]] = st.Feature(
            line[0], line[1], line[2],
            st.Location(line[3], int(line[4]), int(line[5]),
                        line[6] == 'R'))
    tfile = util.read_dfile('testdata/Halobacterium_sp_feature_names',
                            comment='--')
    synonyms = th.create_from_rsat_feature_names(tfile)
    return MockOrganismWithSynonyms('64091', features, synonyms)
def test_motif_scoring(self):
    """tests the motif scoring in integration"""
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='\"')
    ratio_matrix = matrix_factory.create_from(infile)
    organism = testutil.make_halo(search_distances, scan_distances,
                                  ratio_matrix)
    membership = FakeMembership()
    config_params = {'memb.min_cluster_rows_allowed': 3,
                     'memb.max_cluster_rows_allowed': 70,
                     'multiprocessing': False,
                     'num_clusters': 1,
                     'output_dir': 'out',
                     'debug': {},
                     'search_distances': {'upstream': (-20, 150)},
                     'num_iterations': 2000,
                     'MEME': {'schedule': lambda i: True,
                              'version': '4.3.0',
                              'global_background': False,
                              'arg_mod': 'zoops',
                              'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                              'use_revcomp': 'True',
                              'max_width': 24,
                              'background_order': 3},
                     'Motifs': {'schedule': lambda i: True,
                                'scaling': ('scaling_const', 1.0)}}
    func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                     config_params=config_params)
    iteration_result = {'iteration': 100}
    matrix = func.compute(iteration_result)
def read_edges2(filename, organism, ratios):
    """just read a preprocessed file, much faster to debug"""
    logging.info("stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    result = []
    max_score = 0.0
    thesaurus = organism.thesaurus()
    if ratios:
        cano_genes = {thesaurus[row] for row in ratios.row_names
                      if row in thesaurus}
    else:
        cano_genes = None
    num_ignored = 0
    for line in dfile.lines:
        node1 = patches.patch_string_gene(organism_code, line[0])
        node2 = patches.patch_string_gene(organism_code, line[1])
        score = float(line[2])
        max_score = max(score, max_score)
        if can_add_edge(node1, node2, thesaurus, cano_genes):
            result.append((intern(node1), intern(node2), score))
        else:
            num_ignored += 1
    if not normalized:
        result = normalize_edges_to_max_score(result, max_score)
    logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                 len(result), num_ignored)
    return result
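# normalize_edges_to_max_score is not shown in this listing; from its name
# and the way the call sites rebind the result, it plausibly rescales edge
# scores into [0, 1]. This is a sketch of that assumed behavior, not the
# project's actual implementation.
def normalize_edges_to_max_score(edges, max_score):
    """assumed behavior: rescale every edge score relative to the maximum"""
    return [(node1, node2, score / max_score)
            for node1, node2, score in edges]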
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    matrix_factory = DataMatrixFactory([nochange_filter,
                                        center_scale_filter])
    if os.path.exists(ratiofile):
        infile = util.read_dfile(ratiofile, has_header=True, quote='\"')
        matrix = matrix_factory.create_from(infile)
        split_matrix(matrix, outdir, n, kmin, matrix.num_columns)
def setUp(self):  # pylint: disable-msg=C0103
    """test fixture"""
    self.search_distances = {'upstream': (-20, 150)}
    self.scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory(
        [dm.nochange_filter, dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='\"')
    self.ratio_matrix = matrix_factory.create_from(infile)
    self.organism = make_halo(self.ratio_matrix, self.search_distances,
                              self.scan_distances)
    self.config_params = {'memb.min_cluster_rows_allowed': 3,
                          'memb.max_cluster_rows_allowed': 70,
                          'multiprocessing': False,
                          'memb.clusters_per_row': 2,
                          'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                          'num_clusters': 43,
                          'output_dir': 'out',
                          'remap_network_nodes': False,
                          'num_iterations': 2000,
                          'debug': False}
    self.membership = self.__read_members()  # relies on config_params
    self.iteration_result = {'iteration': 51}
def test_motif_scoring(self):
    """tests the motif scoring in integration"""
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('halo_ratios5.tsv', has_header=True, quote='\"')
    ratio_matrix = matrix_factory.create_from(infile)
    meme_suite = meme.MemeSuite430(remove_tempfiles=True)
    sequence_filters = [
        motif.unique_filter,
        motif.get_remove_low_complexity_filter(meme_suite),
        motif.get_remove_atgs_filter(search_distances['upstream'])]
    organism = make_halo(ratio_matrix, search_distances, scan_distances)
    membership = FakeMembership()
    config_params = {'memb.min_cluster_rows_allowed': 3,
                     'memb.max_cluster_rows_allowed': 70,
                     'multiprocessing': False,
                     'num_clusters': 1,
                     'output_dir': 'out',
                     'num_iterations': 2000}
    func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                     meme_suite,
                                     sequence_filters=sequence_filters,
                                     scaling_func=lambda iter: 1.0,
                                     num_motif_func=motif.default_nmotif_fun,
                                     config_params=config_params)
    iteration_result = {'iteration': 100}
    matrix = func.compute(iteration_result)
def test_read_with_quotes(self):
    """Reads a semicolon delimited file with quotes"""
    dfile = util.read_dfile("testdata/withquotes.ssv", sep=';',
                            has_header=False, comment='#', quote='"')
    lines = dfile.lines
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
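# From the separator, quote character, and assertions, the withquotes.ssv
# fixture presumably holds two quoted, semicolon-separated rows. The stdlib
# csv module reproduces the expected parsing; util.read_dfile is the
# project's own reader, so this only illustrates the assumed file format.
import csv

fixture = ['"value11";"value12"', '"value21";"value22"']
lines = list(csv.reader(fixture, delimiter=';', quotechar='"'))
assert lines[0] == ['value11', 'value12']
assert lines[1] == ['value21', 'value22']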
def test_motif_scoring(self):
    """tests the motif scoring in integration"""
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='\"')
    ratio_matrix = matrix_factory.create_from(infile)
    meme_suite = meme.MemeSuite430(remove_tempfiles=True)
    sequence_filters = [
        motif.unique_filter,
        motif.get_remove_low_complexity_filter(meme_suite),
        motif.get_remove_atgs_filter(search_distances['upstream'])]
    organism = make_halo(ratio_matrix, search_distances, scan_distances)
    membership = FakeMembership()
    config_params = {'memb.min_cluster_rows_allowed': 3,
                     'memb.max_cluster_rows_allowed': 70,
                     'multiprocessing': False,
                     'num_clusters': 1,
                     'output_dir': 'out',
                     'debug': False,
                     'num_iterations': 2000}
    func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                     meme_suite,
                                     sequence_filters=sequence_filters,
                                     scaling_func=lambda iter: 1.0,
                                     num_motif_func=lambda iter: 1,
                                     update_in_iteration=lambda x: True,
                                     motif_in_iteration=lambda x: True,
                                     config_params=config_params)
    iteration_result = {'iteration': 100}
    matrix = func.compute(iteration_result)
def __sequences_for_genes(self, seqtype, genes, distance):
    """retrieves the specified sequences from the supplied genomic data"""
    if seqtype not in self.__seqs:
        logging.info('loading %s sequences' % seqtype)
        dfile = util.read_dfile(self.__seq_filenames[seqtype], sep=',')
        self.__seqs[seqtype] = {}
        for line in dfile.lines:
            self.__seqs[seqtype][line[0].upper()] = line[1].upper()
        logging.info('loaded %i %s sequences' % (len(self.__seqs[seqtype]),
                                                 seqtype))
    result = {}
    for alias in genes:
        if alias in self.thesaurus():
            gene = self.thesaurus()[alias]
            if gene in self.__seqs[seqtype]:
                # note that we have to return the sequence as a
                # (location, sequence) pair even if we do not actually
                # use the Location
                result[gene] = (st.Location(gene, 0, 0, False),
                                self.__seqs[seqtype][gene])
            else:
                #logging.warn("Gene '%s' not found in 3' UTRs", gene)
                pass
        else:
            #logging.warn("Alias '%s' not in thesaurus !", alias)
            pass
    return result
def test_read_with_tabs(self):
    """Reads a tab delimited file"""
    dfile = util.read_dfile("testdata/simple.tsv")
    lines = dfile.lines
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
    self.assertIsNone(dfile.header)
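# Together with test_read_with_tabs_and_header further down, the assertions
# pin down the content of testdata/simple.tsv: two tab-separated rows, the
# first of which doubles as the header when has_header=True.
simple_tsv = "value11\tvalue12\nvalue21\tvalue22\n"
rows = [line.split('\t') for line in simple_tsv.splitlines()]
assert rows == [['value11', 'value12'], ['value21', 'value22']]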
def test_read_with_semicolon_header_and_comments(self):
    """Reads a semicolon delimited file with a header and comments"""
    dfile = util.read_dfile("testdata/withcomments.ssv", sep=';',
                            has_header=True, comment='#')
    lines = dfile.lines
    self.assertEquals(2, len(lines))
    self.assertEquals(["header1", "header2"], dfile.header)
def read_edges2(filename):
    """just read a preprocessed file, much faster to debug"""
    logging.info("\x1b[31mstringdb:\t\x1b[0mreading interaction network - "
                 "stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    result = []
    for line in dfile.lines:
        result.append((line[0], line[1], float(line[2])))
    return result
def __make_ref_operon_pairs(self):
    """returns reference operon pairs for comparison"""
    reffile = util.read_dfile('testdata/operon_reftable.tsv',
                              has_header=True, quote='"')
    refpairs = []
    for line in reffile.lines:
        refpairs.append((line[1], line[2]))
    return refpairs
def read_edges3(filename):
    """just read a preprocessed file, much faster to debug"""
    logging.info("stringdb.read_edges3()")
    dfile = util.read_dfile(filename, sep=",", has_header=True, quote='"')
    result = []
    for line in dfile.lines:
        result.append([line[1], line[2], float(line[3])])
    return result
def read_csv(cls, name, infile, cutoff=None, sep=','):
    """reads a set from a CSV file"""
    dfile = util.read_dfile(infile, sep)
    sets = {}
    for line in dfile.lines:
        if line[0] not in sets:
            sets[line[0]] = EnrichmentSet('discrete')
        sets[line[0]].add(line[1].upper(), 1)
    return SetType(name, sets)
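# A minimal sketch of the grouping read_csv performs, with made-up set and
# gene names; only the two-column layout (set name, gene) is taken from the
# code above, and plain sets stand in for EnrichmentSet.
lines = [['set1', 'vng0001'], ['set1', 'vng0002'], ['set2', 'vng0003']]
sets = {}
for line in lines:
    sets.setdefault(line[0], set()).add(line[1].upper())
assert sets == {'set1': set(['VNG0001', 'VNG0002']),
                'set2': set(['VNG0003'])}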
def test_read_with_empty_lines(self):
    """Reads a semicolon delimited file containing empty lines"""
    dfile = util.read_dfile("testdata/withemptylines.ssv", sep=';',
                            has_header=True, comment='#', quote='"')
    lines = dfile.lines
    self.assertEquals(["header1", "header2"], dfile.header)
    self.assertEquals(2, len(lines))
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
def read_edges2(filename):
    """just read a preprocessed file, much faster to debug"""
    logging.info("stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    result = []
    for line in dfile.lines:
        result.append([patches.patch_string_gene(organism_code, line[0]),
                       patches.patch_string_gene(organism_code, line[1]),
                       float(line[2])])
    return result
def make_microbe(self):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
    mo_db = microbes_online.MicrobesOnline()
    stringfile = self.config_params['string_file']
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)

    # automatically download STRING file
    if stringfile is None:
        rsat_info = rsat_mapper(kegg_mapper(self['organism_code']))
        ncbi_code = rsat_info.taxonomy_id
        print "NCBI CODE IS: ", ncbi_code
        url = STRING_URL_PATTERN % ncbi_code
        stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
        self['string_file'] = stringfile
        logging.info("Automatically using STRING file in '%s'", stringfile)
        util.get_url_cached(url, stringfile)

    nw_factories = []
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory2(
            self['organism_code'], stringfile, 0.5))
    else:
        logging.warn("no STRING file specified !")
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=self.ratio_matrix.num_rows / 20, weight=0.5))
    org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories)
    return org_factory.create(self['organism_code'],
                              self['search_distances'],
                              self['scan_distances'])
def create_from_delimited_file2(dfile):
    """creates a thesaurus from a delimited file where the format is
    <original>SEPARATOR<alt1>;<alt2>;...
    ..."""
    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)
    result = {}
    for line in dfile.lines:
        original = line[0].upper()
        # original should map to itself
        result[original] = original
        for alternative in line[1].split(';'):
            result[alternative.upper()] = original
    return result
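# Tracing one row through the loop above shows the shape of the resulting
# thesaurus; the gene names here are invented for illustration.
# Input row: "VNG0001H,gene1;geneA" parsed into two fields.
line = ['VNG0001H', 'gene1;geneA']
result = {}
original = line[0].upper()
result[original] = original  # original maps to itself
for alternative in line[1].split(';'):
    result[alternative.upper()] = original
assert result == {'VNG0001H': 'VNG0001H',
                  'GENE1': 'VNG0001H',
                  'GENEA': 'VNG0001H'}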
def read_edges2(filename):
    """just read a preprocessed file, much faster to debug"""
    logging.info("stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    result = []
    max_score = 0.0
    for line in dfile.lines:
        score = float(line[2])
        max_score = max(score, max_score)
        result.append((patches.patch_string_gene(organism_code, line[0]),
                       patches.patch_string_gene(organism_code, line[1]),
                       score))
    if not normalized:
        # rebind the result, as the other read_edges2 variants do; the
        # original discarded the return value here
        result = normalize_edges_to_max_score(result, max_score)
    return result
def make_halo(ratio_matrix, search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline()
    stringfile = 'string_links_64091.tab'
    nw_factories = []
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory2('hal', stringfile,
                                                          0.5))
    else:
        logging.warn("no STRING file specified !")
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))
    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories)
    return org_factory.create('hal', search_distances, scan_distances)
def create_from_delimited_file2(dfile, case_sensitive):
    """creates a thesaurus from a delimited file where the format is
    <original>SEPARATOR<alt1>;<alt2>;...
    ..."""
    def fix_case(s):
        return s if case_sensitive else s.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)
    result = {}
    for line in dfile.lines:
        original = intern(fix_case(line[0]))
        # original should map to itself
        result[original] = original
        for alternative in line[1].split(';'):
            result[intern(fix_case(alternative))] = original
    return result
def setUp(self):  # pylint: disable-msg=C0103
    """test fixture"""
    self.search_distances = {'upstream': (-20, 150)}
    self.scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('halo_ratios5.tsv', has_header=True, quote='\"')
    self.ratio_matrix = matrix_factory.create_from(infile)
    self.organism = make_halo(self.ratio_matrix, self.search_distances,
                              self.scan_distances)
    self.config_params = {'memb.min_cluster_rows_allowed': 3,
                          'memb.max_cluster_rows_allowed': 70,
                          'multiprocessing': False,
                          'memb.clusters_per_row': 2,
                          'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                          'num_clusters': 43,
                          'num_iterations': 2000}
    self.membership = self.__read_members()  # relies on config_params
    self.iteration_result = {'iteration': 51}
def read_matrix(filename):
    """reads the data matrix from a file"""
    controls = read_controls()
    rug = read_rug(lambda row: row[1] in RUG_PROPS)
    columns_to_use = list(set(rug + controls))
    # pass the column filter as the first filter to the DataMatrixFactory,
    # so normalization will be applied to the submatrix
    matrix_factory = dm.DataMatrixFactory([
        lambda matrix: matrix.submatrix_by_name(column_names=columns_to_use)])
    infile = util.read_dfile(filename, sep=',', has_header=True, quote="\"")
    matrix = matrix_factory.create_from(infile)
    column_groups = {1: range(matrix.num_columns)}
    if SELECT_ROWS:
        select_rows = select_probes(matrix, 2000, column_groups)
        matrix = matrix.submatrix_by_rows(select_rows)
    return intensities_to_ratios(matrix, controls, column_groups)
def setUp(self):  # pylint: disable-msg=C0103
    """test fixture"""
    self.search_distances = {'upstream': (-20, 150)}
    self.scan_distances = {'upstream': (-30, 250)}
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='\"')
    self.ratio_matrix = matrix_factory.create_from(infile)
    self.organism = testutil.make_halo(self.search_distances,
                                       self.scan_distances,
                                       self.ratio_matrix)
    self.config_params = {'memb.min_cluster_rows_allowed': 3,
                          'memb.max_cluster_rows_allowed': 70,
                          'multiprocessing': False,
                          'num_cores': None,
                          'memb.clusters_per_row': 2,
                          'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                          'num_clusters': 43,
                          'output_dir': 'out',
                          'remap_network_nodes': False,
                          'use_BSCM': False,
                          'num_iterations': 2000,
                          'debug': {},
                          'search_distances': {'upstream': (-20, 150)},
                          'Columns': {'schedule': lambda i: True},
                          'Rows': {'schedule': lambda i: True,
                                   'scaling': ('scaling_const', 6.0)},
                          'Motifs': {'schedule': lambda i: True,
                                     'scaling': ('scaling_rvec',
                                                 'seq(0, 1, length=num_iterations*3/4)')},
                          'MEME': {'version': '4.3.0',
                                   'global_background': False,
                                   'schedule': lambda i: True,
                                   'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                                   'max_width': 24,
                                   'arg_mod': 'zoops',
                                   'background_order': 3,
                                   'use_revcomp': 'True'},
                          'Networks': {'schedule': lambda i: True,
                                       'scaling': ('scaling_rvec',
                                                   'seq(1e-5, 0.5, length=num_iterations*3/4)')}}
    self.membership = self.__read_members()  # relies on config_params
    self.iteration_result = {'iteration': 51, 'score_means': {}}
def test_get_non_existing(self):
    """retrieve None for a non-existing organism"""
    dfile = util.read_dfile(PROT2TAXID_FILE_PATH, sep='\t',
                            has_header=False)
    mapper = org.make_go_taxonomy_mapper(dfile)
    self.assertIsNone(mapper('does not exist'))
def test_get_existing(self):
    """retrieve an existing id"""
    dfile = util.read_dfile(PROT2TAXID_FILE_PATH, sep='\t',
                            has_header=False)
    mapper = org.make_go_taxonomy_mapper(dfile)
    self.assertEquals('64091', mapper('Halobacterium salinarium'))
def test_get_non_existing_organism(self):
    """retrieve non-existing organism"""
    dfile = util.read_dfile(TAXONOMY_FILE_PATH, sep='\t',
                            has_header=True, comment='#')
    mapper = org.make_kegg_code_mapper(dfile)
    self.assertIsNone(mapper('nope'))
def test_get_existing_organism(self):
    """retrieve existing organism"""
    dfile = util.read_dfile(TAXONOMY_FILE_PATH, sep='\t',
                            has_header=True, comment='#')
    mapper = org.make_kegg_code_mapper(dfile)
    self.assertEquals('Helicobacter pylori 26695', mapper('hpy'))
def __read_colscores_refresult(self):
    dfile = util.read_dfile('testdata/column_scores_refresult.tsv',
                            has_header=True, quote='"')
    return dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
def __read_ratios(self):
    dfile = util.read_dfile('testdata/row_scores_testratios.tsv',
                            has_header=True)
    return dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
def make_microbe(self):
    """returns the organism object to work on"""
    self.__make_dirs_if_needed()

    if os.path.exists(USER_KEGG_FILE_PATH):
        keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
    elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
        keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
    else:
        raise Exception('KEGG file not found !!')

    if os.path.exists(USER_GO_FILE_PATH):
        gofile = util.read_dfile(USER_GO_FILE_PATH)
    elif os.path.exists(SYSTEM_GO_FILE_PATH):
        gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
    else:
        raise Exception('GO file not found !!')

    if self['rsat_dir']:
        if not self['rsat_organism']:
            raise Exception(
                'override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'],
                                self['ncbi_code'])
    else:
        rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])

    if self['operon_file']:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
    else:
        logging.info("attempting automatic download of operons from "
                     "Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

    stringfile = self['string_file']
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    ncbi_code = self['ncbi_code']
    nw_factories = []

    # do we use STRING ?
    if self['donetworks'] and self['use_string']:
        # download if not provided
        if stringfile is None:
            if ncbi_code is None:
                rsat_info = rsat_mapper(kegg_mapper(self['organism_code']),
                                        self['rsat_organism'])
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'",
                         stringfile)
            util.get_url_cached(url, stringfile)
        else:
            logging.info("Loading STRING file at '%s'", stringfile)

        # create and add network
        nw_factories.append(stringdb.get_network_factory2(
            self['organism_code'], stringfile, 0.5))

    # do we use operons ?
    if self['donetworks'] and self['use_operons']:
        logging.info('adding operon network factory')
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=self.ratio_matrix.num_rows / 20,
            weight=0.5))

    org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories, self['ncbi_code'])
    return org_factory.create(self['organism_code'],
                              self['search_distances'],
                              self['scan_distances'],
                              self['use_operons'], self['rsat_organism'],
                              self.ratio_matrix)
def read_edges2(filename, organism, ratios):
    """just read a preprocessed file, much faster to debug"""
    logging.info("stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    logging.info("Finished loading %s", filename)
    result = []
    max_score = 0.0
    thesaurus = organism.thesaurus()
    if ratios:
        gene_lut = {}
        for row_name in ratios.row_names:
            if row_name in thesaurus:
                gene_lut[thesaurus[row_name]] = row_name
            gene_lut[row_name] = row_name  # a node always maps to itself
        cano_genes = gene_lut.keys()
    else:
        gene_lut = None
        cano_genes = None
    num_ignored = 0
    # big speedup: search thesaurus and cano_genes only once for each gene
    keep_bool = {}
    idx = 1  # used to display progress
    for line in dfile.lines:
        # this can be slow, so display progress every 5%
        frac = idx % (len(dfile.lines) / 20)
        idx += 1
        if frac == 0:
            logging.info("Processing network %d%%",
                         round(100 * float(idx) / len(dfile.lines)))
        node1 = patches.patch_string_gene(organism_code, line[0])
        node2 = patches.patch_string_gene(organism_code, line[1])
        for node in (node1, node2):
            if node not in keep_bool:
                if cano_genes is not None:
                    keep_bool[node] = (node in thesaurus and
                                       thesaurus[node] in cano_genes)
                else:
                    keep_bool[node] = node in thesaurus
                # add this node to the lut if it is not already there
                if gene_lut is not None and node not in gene_lut:
                    gene_lut[node] = node
                    if node in thesaurus:
                        gene_lut[thesaurus[node]] = node
        score = float(line[2])
        max_score = max(score, max_score)
        if keep_bool[node1] and keep_bool[node2]:
            # 2/18/15 SD. Translate nodes into names in ratio rows using
            # gene_lut; this lets the ratios matrix define how genes are named
            if gene_lut is None:
                new_edge = (intern(node1), intern(node2), score)
            else:
                new_edge = (intern(gene_lut[node1]), intern(gene_lut[node2]),
                            score)
            result.append(new_edge)
        else:
            num_ignored += 1
    if not normalized:
        result = normalize_edges_to_max_score(result, max_score)
    logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                 len(result), num_ignored)
    return result
def read_matrix(filename):
    """reads a matrix file"""
    infile = util.read_dfile(filename, has_header=True, quote='\"')
    return dm.DataMatrixFactory([]).create_from(
        infile, case_sensitive=True).sorted_by_row_name()
def test_read_with_tabs_and_header(self):
    """Reads a tab delimited file with a header"""
    dfile = util.read_dfile("testdata/simple.tsv", has_header=True)
    lines = dfile.lines
    self.assertEquals(1, len(lines))
    self.assertEquals(["value11", "value12"], dfile.header)