Exemple #1
0
def make_halo(search_distances, scan_distances, ratios=None):
    """Assemble and return the Halobacterium sp. (NCBI 64091) organism object.

    Args:
        search_distances: dict of sequence type -> (start, end) motif search range
        scan_distances: dict of sequence type -> (start, end) scan range
        ratios: optional expression matrix; when given, a Microbes Online
            operon network is added to the network factories as well
    Returns:
        an org.Microbe configured with KEGG/GO/RSAT/STRING data
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # idiom fix: compare against None with 'is not', not '!='
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        # fix: logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    if ratios is not None:
        # fix: integer division so max_operon_size stays an int under
        # Python 3 (true division would pass a float)
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows // 20, weight=0.5))

    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
Exemple #2
0
def make_halo(search_distances, scan_distances, ratios=None):
    """Build the Halobacterium organism object used by the tests."""
    kegg_data = util.read_dfile(KEGG_FILE_PATH, comment='#')
    go_data = util.read_dfile(GO_FILE_PATH)
    rsat_database = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                                      'Halobacterium sp', 64091)
    microbes_db = microbes_online.MicrobesOnline(CACHE_DIR)
    string_path = 'testdata/string_links_64091.tab'

    factories = []
    if string_path != None:
        string_factory = stringdb.get_network_factory('hal', string_path, 0.5)
        factories.append(string_factory)
    else:
        logging.warn("no STRING file specified !")

    if ratios is not None:
        operon_factory = microbes_online.get_network_factory(
            microbes_db, max_operon_size=ratios.num_rows / 20, weight=0.5)
        factories.append(operon_factory)

    kegg_organism = util.make_dfile_map(kegg_data, 1, 3)['hal']
    rsat_org_name = rsat_database.get_rsat_organism(kegg_organism)
    species_info = org.RsatSpeciesInfo(rsat_database, kegg_organism,
                                       rsat_org_name, 64091)
    go_taxonomy = util.make_dfile_map(go_data, 0, 1)[species_info.go_species()]
    return org.Microbe('hal', kegg_organism, species_info, go_taxonomy,
                       microbes_db, factories, search_distances,
                       scan_distances, True, None)
 def __get_kegg_data(self):
     """Look up the KEGG species name and NCBI taxonomy code for this organism.

     Reads the KEGG organism table (the user file is preferred over the
     system file), caches the NCBI code in self['ncbi_code'] when it was
     not already configured, and returns (ncbi_code, kegg_species_name).
     Raises Exception when neither KEGG file exists.
     """
     # determine the NCBI code
     organism_code = self['organism_code']
     # prefer a user-provided KEGG table; fall back to the system-wide one
     if os.path.exists(USER_KEGG_FILE_PATH):
         keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
     elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
         keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
     else:
         raise Exception('KEGG file not found !!')
     # presumably column 1 is the KEGG code, column 3 the species name and
     # column 2 the NCBI taxonomy id -- verify against the KEGG table format
     kegg_map = util.make_dfile_map(keggfile, 1, 3)
     kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
     # only fill in the NCBI code when the user did not configure one
     if self['ncbi_code'] is None and organism_code in kegg2ncbi:
         self['ncbi_code'] = kegg2ncbi[organism_code]
     return self['ncbi_code'], kegg_map[organism_code]
Exemple #4
0
 def __get_kegg_data(self):
     """Determine the NCBI taxonomy code and KEGG species for this organism."""
     org_code = self['organism_code']
     # try the user-level KEGG table first, then the system-wide one
     for kegg_path in (USER_KEGG_FILE_PATH, SYSTEM_KEGG_FILE_PATH):
         if os.path.exists(kegg_path):
             keggfile = util.read_dfile(kegg_path, comment='#')
             break
     else:
         raise Exception('KEGG file not found !!')
     kegg_map = util.make_dfile_map(keggfile, 1, 3)
     kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
     if self['ncbi_code'] is None and org_code in kegg2ncbi:
         self['ncbi_code'] = kegg2ncbi[org_code]
     return self['ncbi_code'], kegg_map[org_code]
 def __make_organism(self):
     """makes a mock organism with almost real data"""
     dfile = util.read_dfile('testdata/Halobacterium_sp_features',
                             comment='--')
     features = {}
     for row in dfile.lines:
         location = st.Location(row[3], int(row[4]), int(row[5]),
                                row[6] == 'R')
         features[row[0]] = st.Feature(row[0], row[1], row[2], location)
     tfile = util.read_dfile('testdata/Halobacterium_sp_feature_names',
                             comment='--')
     return MockOrganismWithSynonyms(
         '64091', features, th.create_from_rsat_feature_names(tfile))
 def __make_organism(self):
     """makes a mock organism with almost real data

     Loads feature records and RSAT feature names from the bundled
     testdata files and wraps them in a MockOrganismWithSynonyms for
     taxonomy id 64091 (Halobacterium sp.).
     """
     features = {}
     dfile = util.read_dfile('testdata/Halobacterium_sp_features',
                             comment='--')
     for line in dfile.lines:
         # columns appear to be: id, type, name, contig, start, end,
         # strand ('R' = reverse) -- NOTE(review): inferred from usage, confirm
         features[line[0]] = st.Feature(line[0], line[1], line[2],
                                        st.Location(line[3],
                                                    int(line[4]),
                                                    int(line[5]),
                                                    line[6] == 'R'))
     tfile = util.read_dfile(
         'testdata/Halobacterium_sp_feature_names', comment='--')
     synonyms = th.create_from_rsat_feature_names(tfile)
     return MockOrganismWithSynonyms('64091', features, synonyms)
Exemple #7
0
def read_ratios(params, args_in):
    """Read the gene expression (ratios) matrix from a local file or a URL.

    Args:
        params: configuration dict ('normalize_ratios', 'case_sensitive')
        args_in: parsed command line arguments ('ratios', 'case_sensitive')
    Returns:
        the DataMatrix produced by DataMatrixFactory.create_from()
    """
    if params['normalize_ratios']:
        # Turn off the nochange_filter if you're resuming a run and have
        # changed the data matrix (idiom fix: no '== True' comparison)
        if test_data_change(params, args_in):
            ratio_filters = [dm.center_scale_filter]
        else:
            ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []

    matrix_factory = dm.DataMatrixFactory(ratio_filters)
    matrix_filename = args_in.ratios

    if matrix_filename.startswith('http://'):
        # bug fix: read_url() returns bytes under Python 3 -- decode before
        # parsing, consistent with the other read_ratios variant in this code
        indata = util.read_url(matrix_filename).decode('utf-8')
        infile = util.dfile_from_text(indata, has_header=True, quote='\"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='\"')

    # case sensitivity of gene names can come from config or command line
    if params['case_sensitive'] or args_in.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
Exemple #8
0
    def setUp(self):  # pylint; disable-msg=C0103
        """test fixture

        Builds the halo test organism from the example ratios matrix and
        prepares the membership/config state shared by the tests.
        """
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        # filter + normalize the example ratios before building the organism
        matrix_factory = dm.DataMatrixFactory(
            [dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True,
                                 quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = testutil.make_halo(self.search_distances,
                                           self.scan_distances,
                                           self.ratio_matrix)
        # minimal configuration required by the membership code
        self.config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
            'num_clusters': 43,
            'num_iterations': 2000
        }
        self.membership = self.__read_members()  # relies on config_params
        self.iteration_result = {'iteration': 51}
Exemple #9
0
 def test_read_with_tabs(self):
     """Reads a tab delimited file"""
     dfile = util.read_dfile("testdata/simple.tsv")
     lines = dfile.lines
     # fix: assertEquals is a deprecated unittest alias; use assertEqual
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
     self.assertIsNone(dfile.header)
Exemple #10
0
 def test_read_with_quotes(self):
     """Reads a semicolon delimited file with quotes"""
     # quote='"' should strip the surrounding quote characters from values
     dfile = util.read_dfile("testdata/withquotes.ssv", sep=';',
                             has_header=False, comment='#', quote='"')
     lines = dfile.lines
     self.assertEquals(["value11", "value12"], lines[0])
     self.assertEquals(["value21", "value22"], lines[1])
Exemple #11
0
 def test_read_with_tabs(self):
     """Reads a tab delimited file"""
     # tab is the default separator, so no sep argument is given
     dfile = util.read_dfile("testdata/simple.tsv")
     lines = dfile.lines
     self.assertEquals(["value11", "value12"], lines[0])
     self.assertEquals(["value21", "value22"], lines[1])
     # no has_header argument -> header should be None
     self.assertIsNone(dfile.header)
Exemple #12
0
 def test_read_with_semicolon_header_and_comments(self):
     """Reads a semicolon delimited file with a header and comments"""
     dfile = util.read_dfile("testdata/withcomments.ssv", sep=';',
                             has_header=True, comment='#')
     lines = dfile.lines
     # fix: assertEquals is a deprecated unittest alias; use assertEqual
     self.assertEqual(2, len(lines))
     self.assertEqual(["header1", "header2"], dfile.header)
Exemple #13
0
def read_ratios(params, args_in):
    """Read the gene expression (ratios) matrix from a local file or a URL.

    Args:
        params: configuration dict ('normalize_ratios', 'case_sensitive')
        args_in: parsed command line arguments ('ratios', 'case_sensitive')
    Returns:
        the DataMatrix produced by DataMatrixFactory.create_from()
    """
    if params['normalize_ratios']:
        # Turn off the nochange_filter if you're resuming a run and have
        # changed the data matrix (idiom fix: no '== True' comparison)
        if test_data_change(params, args_in):
            ratio_filters = [dm.center_scale_filter]
        else:
            ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []

    matrix_factory = dm.DataMatrixFactory(ratio_filters)
    matrix_filename = args_in.ratios

    if matrix_filename.startswith('http://'):
        # read_url() returns bytes under Python 3, so decode before parsing
        indata = util.read_url(matrix_filename).decode('utf-8')
        infile = util.dfile_from_text(indata, has_header=True, quote='\"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='\"')

    # case sensitivity of gene names can come from config or command line
    if params['case_sensitive'] or args_in.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
Exemple #14
0
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    """Read the ratios matrix at ratiofile and split it into sub-matrices
    written to outdir via split_matrix().

    NOTE(review): when ratiofile does not exist this silently does nothing --
    confirm this best-effort behavior is intended.
    """
    matrix_factory = DataMatrixFactory([nochange_filter,
                                        center_scale_filter])
    if os.path.exists(ratiofile):
        infile = util.read_dfile(ratiofile, has_header=True, quote='\"')
        matrix = matrix_factory.create_from(infile)
        split_matrix(matrix, outdir, n, kmin, matrix.num_columns)
    def test_motif_scoring(self):
        """tests the motif scoring in integration

        Builds the halo organism from the example ratios, then runs one
        MemeScoringFunction.compute() pass. Requires the example data files
        and a working MEME installation (version configured below).
        """
        search_distances = {"upstream": (-20, 150)}
        scan_distances = {"upstream": (-30, 250)}

        # filter + normalize the example ratios before building the organism
        matrix_factory = dm.DataMatrixFactory([dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile("example_data/hal/halo_ratios5.tsv", has_header=True, quote='"')
        ratio_matrix = matrix_factory.create_from(infile)
        organism = testutil.make_halo(search_distances, scan_distances, ratio_matrix)
        membership = FakeMembership()
        # minimal configuration needed by MemeScoringFunction
        config_params = {
            "memb.min_cluster_rows_allowed": 3,
            "memb.max_cluster_rows_allowed": 70,
            "multiprocessing": False,
            "num_clusters": 1,
            "output_dir": "out",
            "debug": {},
            "search_distances": {"upstream": (-20, 150)},
            "num_iterations": 2000,
            "MEME": {
                "schedule": lambda i: True,
                "version": "4.3.0",
                "global_background": False,
                "arg_mod": "zoops",
                "nmotifs_rvec": "c(rep(1, num_iterations/3), rep(2, num_iterations/3))",
                "use_revcomp": "True",
                "max_width": 24,
                "background_order": 3,
            },
            "Motifs": {"schedule": lambda i: True, "scaling": ("scaling_const", 1.0)},
        }
        func = motif.MemeScoringFunction(organism, membership, ratio_matrix, config_params=config_params)
        iteration_result = {"iteration": 100}
        matrix = func.compute(iteration_result)
Exemple #16
0
 def __make_ref_operon_pairs(self):
     """returns reference operon pairs for comparison"""
     ref_table = util.read_dfile('testdata/operon_reftable.tsv',
                                 has_header=True, quote='"')
     # columns 1 and 2 hold the two genes of each reference pair
     return [(row[1], row[2]) for row in ref_table.lines]
Exemple #17
0
 def test_read_with_semicolon_header_and_comments(self):
     """Reads a semicolon delimited file with a header and comments"""
     # comment='#' lines and the header row must not appear in dfile.lines
     dfile = util.read_dfile("testdata/withcomments.ssv",
                             sep=';',
                             has_header=True,
                             comment='#')
     lines = dfile.lines
     self.assertEquals(2, len(lines))
     self.assertEquals(["header1", "header2"], dfile.header)
Exemple #18
0
 def test_read_with_empty_lines(self):
     """Reads a semicolon delimited file containing emptylines"""
     # empty lines must be skipped: only 2 data rows should remain
     dfile = util.read_dfile("testdata/withemptylines.ssv", sep=';',
                             has_header=True, comment='#', quote='"')
     lines = dfile.lines
     self.assertEquals(["header1", "header2"], dfile.header)
     self.assertEquals(2, len(lines))
     self.assertEquals(["value11", "value12"], lines[0])
     self.assertEquals(["value21", "value22"], lines[1])
def make_microbe(code):
    """assemble organism related information and return it to the caller

    Args:
        code: KEGG organism code (e.g. 'hal')
    Returns:
        a (MicrobeDB, organism) tuple
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    rsat_info = rsat_mapper(kegg_mapper(code))
    microbedb = MicrobeDB(keggfile, rsatdb, rsat_info)
    # fix: Python 2 print statement -> print() function so the module
    # also runs under Python 3
    print("NCBI CODE IS: ", rsat_info.taxonomy_id)
    gofile = util.read_dfile(GO_FILE_PATH)
    mo_db = microbes_online.MicrobesOnline()
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, [])
    organism = org_factory.create(code, search_distances, scan_distances)
    return microbedb, organism
Exemple #20
0
 def __make_ref_operon_pairs(self):
     """returns reference operon pairs for comparison

     Reads the bundled operon reference table and returns a list of
     (gene1, gene2) tuples taken from columns 1 and 2 of each row.
     """
     reffile = util.read_dfile('testdata/operon_reftable.tsv',
                               has_header=True,
                               quote='"')
     refpairs = []
     for line in reffile.lines:
         refpairs.append((line[1], line[2]))
     return refpairs
Exemple #21
0
 def test_read_with_quotes(self):
     """Reads a semicolon delimited file with quotes"""
     dfile = util.read_dfile("testdata/withquotes.ssv",
                             sep=';',
                             has_header=False,
                             comment='#',
                             quote='"')
     lines = dfile.lines
     # fix: assertEquals is a deprecated unittest alias; use assertEqual
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
def make_microbe(code):
    """assemble organism related information and return it to the caller

    Args:
        code: KEGG organism code (e.g. 'hal')
    Returns:
        a (MicrobeDB, organism) tuple
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    # bug fix: rsat_mapper was referenced below but never defined in this
    # variant (NameError at runtime); build it the same way the sibling
    # make_microbe implementation does
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_mapper(code), None, None)
    microbedb = MicrobeDB(keggfile, rsatdb, rsat_info)
    # fix: Python 2 print statement -> print() function
    print("NCBI CODE IS: ", rsat_info.taxonomy_id)
    gofile = util.read_dfile(GO_FILE_PATH)
    mo_db = microbes_online.MicrobesOnline()
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    org_factory = org.MicrobeFactory(kegg_mapper,
                                     rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     [])
    organism = org_factory.create(code, search_distances, scan_distances)
    return microbedb, organism
Exemple #23
0
 def test_read_with_empty_lines(self):
     """Reads a semicolon delimited file containing emptylines"""
     dfile = util.read_dfile("testdata/withemptylines.ssv",
                             sep=';',
                             has_header=True,
                             comment='#',
                             quote='"')
     lines = dfile.lines
     # fix: assertEquals is a deprecated unittest alias; use assertEqual
     self.assertEqual(["header1", "header2"], dfile.header)
     self.assertEqual(2, len(lines))
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
Exemple #24
0
    def __get_kegg_data(self):
        """Resolve the NCBI taxonomy code and KEGG species for this organism."""
        org_code = self.config_params['organism_code']

        # locate the packaged KEGG table; fall back to the plain path when
        # the cmonkey2 distribution is not installed
        try:
            kegg_path = resource_filename(Requirement.parse("cmonkey2"),
                                          USER_KEGG_FILE_PATH)
        except DistributionNotFound:
            kegg_path = USER_KEGG_FILE_PATH

        kegg_table = util.read_dfile(kegg_path, comment='#')
        code2species = util.make_dfile_map(kegg_table, 1, 3)
        code2ncbi = util.make_dfile_map(kegg_table, 1, 2)
        if self.config_params['ncbi_code'] is None and org_code in code2ncbi:
            self.config_params['ncbi_code'] = code2ncbi[org_code]
        return self.config_params['ncbi_code'], code2species[org_code]
Exemple #25
0
    def __get_kegg_data(self):
        """Look up the KEGG species name and NCBI taxonomy code.

        Caches the NCBI code in config_params['ncbi_code'] when it was not
        already configured and returns (ncbi_code, kegg_species_name).
        """
        # determine the NCBI code
        organism_code = self.config_params['organism_code']

        # resolve the KEGG table inside the installed cmonkey2 package,
        # falling back to the plain relative path when not installed
        try:
            kegg_path = resource_filename(Requirement.parse("cmonkey2"), USER_KEGG_FILE_PATH)
        except DistributionNotFound:
            kegg_path = USER_KEGG_FILE_PATH

        keggfile = util.read_dfile(kegg_path, comment='#')
        # presumably column 1 = KEGG code, column 3 = species, column 2 = NCBI
        # taxonomy id -- verify against the KEGG table format
        kegg_map = util.make_dfile_map(keggfile, 1, 3)
        kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
        # only fill in the NCBI code when the user did not configure one
        if self.config_params['ncbi_code'] is None and organism_code in kegg2ncbi:
            self.config_params['ncbi_code'] = kegg2ncbi[organism_code]
        return self.config_params['ncbi_code'], kegg_map[organism_code]
Exemple #26
0
def create_from_delimited_file2(dfile, case_sensitive):
    """creates a thesaurus from a delimited file where the format is
    <original>SEPARATOR<alt1>;<alt2>;...
    ..."""
    def canon(name):
        # upper-case everything unless the thesaurus is case sensitive
        return name if case_sensitive else name.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)
    mapping = {}
    for row in dfile.lines:
        primary = canon(row[0])
        mapping[primary] = primary  # original should map to itself
        for alt in row[1].split(';'):
            mapping[canon(alt)] = primary
    return mapping
Exemple #27
0
def create_from_delimited_file2(dfile, case_sensitive):
    """creates a thesaurus from a delimited file where the format is
    <original>SEPARATOR<alt1>;<alt2>;...

    Args:
        dfile: either a path to a comma separated file or an object that
            has a 'lines' attribute of already parsed rows
        case_sensitive: when False, every name is upper-cased
    Returns:
        dict mapping each name (original and alternatives) to its original
    """
    def fix_case(s):
        return s if case_sensitive else s.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)
    result = {}
    for line in dfile.lines:
        original = fix_case(line[0])  # original should map to itself
        result[original] = original
        # robustness fix: tolerate rows without an alternatives column
        # (previously raised IndexError on short rows)
        if len(line) > 1:
            for alternative in line[1].split(';'):
                result[fix_case(alternative)] = original
    return result
Exemple #28
0
    def test_motif_scoring(self):
        """tests the motif scoring in integration

        Builds the halo organism from the example ratios, then runs one
        MemeScoringFunction.compute() pass. Requires the example data files
        and a working MEME installation (version configured below).
        """
        search_distances = {'upstream': (-20, 150)}
        scan_distances = {'upstream': (-30, 250)}

        # filter + normalize the example ratios before building the organism
        matrix_factory = dm.DataMatrixFactory(
            [dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True,
                                 quote='\"')
        ratio_matrix = matrix_factory.create_from(infile)
        organism = testutil.make_halo(search_distances, scan_distances,
                                      ratio_matrix)
        membership = FakeMembership()
        # minimal configuration needed by MemeScoringFunction
        config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_clusters': 1,
            'output_dir': 'out',
            'debug': {},
            'search_distances': {
                'upstream': (-20, 150)
            },
            'num_iterations': 2000,
            'MEME': {
                'schedule': lambda i: True,
                'version': '4.3.0',
                'global_background': False,
                'arg_mod': 'zoops',
                'nmotifs_rvec':
                'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                'use_revcomp': 'True',
                'max_width': 24,
                'background_order': 3
            },
            'Motifs': {
                'schedule': lambda i: True,
                'scaling': ('scaling_const', 1.0)
            }
        }
        func = motif.MemeScoringFunction(organism,
                                         membership,
                                         ratio_matrix,
                                         config_params=config_params)
        iteration_result = {'iteration': 100}
        matrix = func.compute(iteration_result)
Exemple #29
0
    def setUp(self):  # pylint; disable-msg=C0103
        """test fixture

        Builds the halo test organism from the example ratios matrix and
        prepares the membership/config state shared by the tests.
        """
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        # filter + normalize the example ratios before building the organism
        matrix_factory = dm.DataMatrixFactory([dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = testutil.make_halo(self.search_distances, self.scan_distances,
                                           self.ratio_matrix)
        # minimal configuration required by the membership code
        self.config_params = {'memb.min_cluster_rows_allowed': 3,
                              'memb.max_cluster_rows_allowed': 70,
                              'multiprocessing': False,
                              'memb.clusters_per_row': 2,
                              'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                              'num_clusters': 43,
                              'num_iterations': 2000}
        self.membership = self.__read_members()  # relies on config_params
        self.iteration_result = { 'iteration': 51 }
Exemple #30
0
def read_ratios(params, args):
    """Read the gene expression (ratios) matrix from a local file or a URL.

    Args:
        params: configuration dict ('normalize_ratios', 'case_sensitive')
        args: parsed command line arguments ('ratios', 'case_sensitive')
    Returns:
        the DataMatrix produced by DataMatrixFactory.create_from()
    """
    if params['normalize_ratios']:
        ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []

    matrix_factory = dm.DataMatrixFactory(ratio_filters)
    matrix_filename = args.ratios

    if matrix_filename.startswith('http://'):
        # bug fix: read_url() returns bytes under Python 3 -- decode before
        # parsing, consistent with the other read_ratios variant in this code
        indata = util.read_url(matrix_filename).decode('utf-8')
        infile = util.dfile_from_text(indata, has_header=True, quote='\"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='\"')

    # case sensitivity of gene names can come from config or command line
    if params['case_sensitive'] or args.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
    def setUp(self):  # pylint; disable-msg=C0103
        """test fixture

        Builds the halo test organism from the example ratios matrix and a
        full scoring configuration (rows/columns/motifs/networks schedules)
        shared by the tests.
        """
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        # filter + normalize the example ratios before building the organism
        matrix_factory = dm.DataMatrixFactory([dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = testutil.make_halo(self.search_distances, self.scan_distances,
                                           self.ratio_matrix)
        # scoring function schedules/scalings mirror the production defaults
        self.config_params = {'memb.min_cluster_rows_allowed': 3,
                              'memb.max_cluster_rows_allowed': 70,
                              'multiprocessing': False,
                              'num_cores': None,
                              'memb.clusters_per_row': 2,
                              'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                              'num_clusters': 43,
                              'output_dir': 'out',
                              'remap_network_nodes': False,
                              'use_BSCM': False,
                              'num_iterations': 2000,
                              'debug': {},
                              'search_distances': {'upstream': (-20, 150)},
                              'Columns': {'schedule': lambda i: True },
                              'Rows': {'schedule': lambda i: True, 'scaling': ('scaling_const', 6.0) },
                              'Motifs': {'schedule': lambda i: True,
                                         'scaling': ('scaling_rvec', 'seq(0, 1, length=num_iterations*3/4)')},
                              'MEME': {'version': '4.3.0',
                                       'global_background': False,
                                       'schedule': lambda i: True,
                                       'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                                       'max_width': 24, 'arg_mod': 'zoops',
                                       'background_order': 3, 'use_revcomp': 'True'},
                              'Networks': {'schedule': lambda i: True, 'scaling': ('scaling_rvec', 'seq(1e-5, 0.5, length=num_iterations*3/4)')}}
        self.membership = self.__read_members()  # relies on config_params
        self.iteration_result = { 'iteration': 51, 'score_means': {} }
Exemple #32
0
    def read_edges2(filename, organism, ratios):
        """just read a preprocessed file, much faster to debug

        Reads (node1, node2, score) edges from filename, keeps only edges
        whose nodes resolve through the organism's thesaurus (and, when a
        ratios matrix is given, to its row names), and renames nodes to the
        ratio-matrix row names.
        NOTE(review): relies on 'sep', 'organism_code' and 'normalized' from
        the enclosing scope -- confirm those closure variables.
        """
        logging.info("stringdb.read_edges2()")
        dfile = util.read_dfile(filename, sep)
        logging.info("Finished loading %s", filename)
        result = []
        max_score = 0.0
        thesaurus = organism.thesaurus()
        if ratios:
            gene_lut = {}
            for row_name in ratios.row_names:
                if row_name in thesaurus:
                    gene_lut[thesaurus[row_name]] = row_name
                gene_lut[row_name] = row_name  # a node should always map to itself
            # use a set for O(1) membership tests in the main loop
            cano_genes = set(gene_lut.keys())
        else:
            gene_lut = None
            cano_genes = None

        num_ignored = 0
        keep_node = {}  # Big Speedup: Use to search thesaurus and cano_genes only once for each gene
        idx = 1  # Used to display progress
        total_nodes = 0
        nodes_not_in_thesaurus = 0
        nodes_not_in_cano_genes = 0

        # bug fix: hoist the progress step out of the loop and guard it --
        # 'len(dfile.lines) / 20' was 0 for files with fewer than 20 lines
        # under Python 2 (ZeroDivisionError) and a float under Python 3
        progress_step = max(1, len(dfile.lines) // 20)
        for line in dfile.lines:
            # This can be slow, display progress every 5%
            frac = idx % progress_step
            idx += 1
            if frac == 0:
                logging.info("Processing network %d%%",
                             round(100 * float(idx) / len(dfile.lines)))

            node1 = patches.patch_string_gene(organism_code, line[0])
            node2 = patches.patch_string_gene(organism_code, line[1])
            for node in (node1, node2):
                if node not in keep_node:
                    if cano_genes is not None:
                        keep_node[node] = (node in thesaurus
                                           and thesaurus[node] in cano_genes)
                    else:
                        keep_node[node] = node in thesaurus
                    if not keep_node[node]:
                        if node not in thesaurus:
                            nodes_not_in_thesaurus += 1
                        elif thesaurus[node] not in cano_genes:
                            nodes_not_in_cano_genes += 1

                    # Add this node to the lut if it is not already there.
                    if gene_lut is not None and node not in gene_lut:
                        gene_lut[node] = node
                        if node in thesaurus:
                            gene_lut[thesaurus[node]] = node
                    total_nodes += 1

            score = float(line[2])
            max_score = max(score, max_score)

            if keep_node[node1] and keep_node[node2]:
                # 2/18/15 SD. Translate nodes into names in ratio rows using
                # gene_lut; this lets the ratios matrix define gene naming
                if gene_lut is None:
                    new_edge = (node1, node2, score)
                else:
                    new_edge = (gene_lut[node1], gene_lut[node2], score)
                result.append(new_edge)
            else:
                num_ignored += 1

        # Warnings
        if nodes_not_in_thesaurus > 0:
            logging.warn('%d (out of %d) nodes not found in synonyms',
                         nodes_not_in_thesaurus, total_nodes)
        if nodes_not_in_cano_genes > 0:
            logging.warn('%d (out of %d) nodes not found in canonical gene names',
                         nodes_not_in_cano_genes, total_nodes)

        if not normalized:
            result = normalize_edges_to_max_score(result, max_score)

        logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                     len(result), num_ignored)

        return result
Exemple #33
0
        if not args.string and not args.operons:
            args.nonetworks = True

    # user overrides in config files
    if args.config:
        config.read(args.config)

    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    matrix_filename = args.ratios

    if matrix_filename.startswith('http://'):
        indata = util.read_url(matrix_filename)
        infile = util.dfile_from_text(indata, has_header=True, quote='\"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='\"')

    matrix = matrix_factory.create_from(infile)
    infile = None

    # override number of clusters either on the command line or through
    # the config file
    try:
        num_clusters = config.getint("General", "num_clusters")
    except:
        num_clusters = args.numclusters

    cmonkey_run = cmr.CMonkeyRun(args.organism, matrix,
                                 string_file=args.string,
                                 rsat_organism=args.rsat_organism,
                                 log_filename=args.logfile,
 def __read_ratios(self):
     """Load the test ratios matrix from the bundled TSV fixture."""
     infile = util.read_dfile('testdata/row_scores_testratios.tsv',
                              has_header=True)
     factory = dm.DataMatrixFactory([])
     return factory.create_from(infile, case_sensitive=True)
 def __read_colscores_refresult(self):
     """Load the reference column-score matrix from the bundled TSV fixture."""
     dfile = util.read_dfile('testdata/column_scores_refresult.tsv',
                             has_header=True, quote='"')
     return dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
    def make_organism(self):
        """Construct and return the organism object for this run.

        Assembles the RSAT, Microbes Online and STRING data sources
        according to the run configuration (read via self[...]),
        builds the network factories with equal weights, creates the
        organism, and records its network and sequence-type names in
        the run database.

        Raises:
            Exception: if no GO file is found, or if an RSAT directory
                override is given without --rsat_organism.
        """
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        # prefer the user-local GO mapping file over the system-wide one
        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        # RSAT source: a local directory override or the remote database
        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], ncbi_code, self['rsat_features'], self['rsat_base_url'])
        else:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'], kegg_species, ncbi_code, self['rsat_features'])

        # operon source: a user-supplied file or an automatic download
        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        nw_factories = []
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks; each enabled network gets an equal share
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks

        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                # resolve the NCBI taxonomy id through RSAT when KEGG
                # did not provide one
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                    self['rsat_organism'], None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s' (URL: %s)",
                             stringfile, url)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory(
                self['organism_code'], stringfile, network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species, self['rsat_organism'],
                                        ncbi_code)
        # map the RSAT species to its GO taxonomy id
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                             self['case_sensitive'])

        #New logic: test to see if there's a fastafile.  If not, then
        #Download it from rsat, process it, and then return the new file name

        # NOTE(review): this unconditionally overwrites the VERTEBRATES-based
        # is_microbe computed above, so the RSATOrganism branch below is
        # unreachable -- confirm whether vertebrate support is intended here
        is_microbe = True
        if is_microbe:
           organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                                   nw_factories,
                                   self['search_distances'], self['scan_distances'],
                                   self['use_operons'], self.ratios, synonyms,
                                   self['fasta_file'])
        else:
            organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                        nw_factories,
                                        self['search_distances'], self['scan_distances'],
                                        self.ratios, synonyms,
                                        self['fasta_file'])

        # record the available statistics types for this run in the database
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])

        return organism
 def __read_colscores_refresult(self):
     """Load the reference column-score matrix from the test fixture."""
     score_file = util.read_dfile('testdata/column_scores_refresult.tsv',
                                  has_header=True,
                                  quote='"')
     factory = dm.DataMatrixFactory([])
     return factory.create_from(score_file, case_sensitive=True)
    def make_organism(self):
        """Construct and return the organism object for this run.

        Assembles the RSAT, Microbes Online and STRING data sources
        according to the run configuration (read via self[...]),
        builds the network factories with equal weights, creates the
        organism, and records its network and sequence-type names in
        the run database.

        Raises:
            Exception: if an RSAT directory override is given without
                --rsat_organism.
        """
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        # locate the GO file inside the installed package; fall back to
        # the plain path when the distribution metadata is unavailable
        try:
            go_file_path = resource_filename(Requirement.parse("cmonkey2"),
                                             USER_GO_FILE_PATH)
        except DistributionNotFound:
            go_file_path = USER_GO_FILE_PATH

        gofile = util.read_dfile(go_file_path)

        # RSAT source: a local directory override or the remote database
        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception(
                    'override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'],
                                    ncbi_code, self['rsat_features'],
                                    self['rsat_base_url'])
        else:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'],
                                       self['cache_dir'], kegg_species,
                                       ncbi_code, self['rsat_features'])

        # operon source: a user-supplied file or an automatic download
        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(
                self['operon_file'])
        else:
            logging.info(
                "attempting automatic download of operons from Microbes Online"
            )
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        nw_factories = []
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks; each enabled network gets an equal share
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks

        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                # resolve the NCBI taxonomy id through RSAT when KEGG
                # did not provide one
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                    self['rsat_organism'],
                                                    None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info(
                    "Automatically using STRING file in '%s' (URL: %s)",
                    stringfile, url)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(
                stringdb.get_network_factory(self['organism_code'], stringfile,
                                             network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            nw_factories.append(
                microbes_online.get_network_factory(
                    mo_db,
                    max_operon_size=self.ratios.num_rows / 20,
                    weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                        self['rsat_organism'], ncbi_code)
        # map the RSAT species to its GO taxonomy id
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(
                self['synonym_file'], self['case_sensitive'])

        #New logic: test to see if there's a fastafile.  If not, then
        #Download it from rsat, process it, and then return the new file name

        # NOTE(review): this unconditionally overwrites the VERTEBRATES-based
        # is_microbe computed above, so the RSATOrganism branch below is
        # unreachable -- confirm whether vertebrate support is intended here
        is_microbe = True
        if is_microbe:
            organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax,
                                   mo_db, nw_factories,
                                   self['search_distances'],
                                   self['scan_distances'], self['use_operons'],
                                   self.ratios, synonyms, self['fasta_file'])
        else:
            organism = org.RSATOrganism(orgcode, kegg_species, rsat_info,
                                        gotax, nw_factories,
                                        self['search_distances'],
                                        self['scan_distances'], self.ratios,
                                        synonyms, self['fasta_file'])

        # record the available statistics types for this run in the database
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)",
                             [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)",
                             [sequence_type])

        return organism
def read_matrix(filename):
    """Read a delimited matrix file and return it sorted by row name."""
    dfile = util.read_dfile(filename, has_header=True, quote='\"')
    matrix = dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
    return matrix.sorted_by_row_name()
 def __read_ratios(self):
     """Load the test ratio matrix from the bundled TSV fixture."""
     fixture = util.read_dfile('testdata/row_scores_testratios.tsv',
                               has_header=True)
     return dm.DataMatrixFactory([]).create_from(fixture,
                                                 case_sensitive=True)
Exemple #41
0
 def test_read_with_tabs_and_header(self):
     """Reads a tab delimited file with a header.

     The header row should be split off into dfile.header and only the
     remaining row kept in dfile.lines.
     """
     dfile = util.read_dfile("testdata/simple.tsv", has_header=True)
     lines = dfile.lines
     # assertEquals is a deprecated alias removed in Python 3.12;
     # use the canonical assertEqual instead
     self.assertEqual(1, len(lines))
     self.assertEqual(["value11", "value12"], dfile.header)
Exemple #42
0
def read_matrix(filename):
    """Read a delimited matrix file and return it sorted by row name."""
    infile = util.read_dfile(filename, has_header=True, quote='\"')
    factory = dm.DataMatrixFactory([])
    matrix = factory.create_from(infile, case_sensitive=True)
    return matrix.sorted_by_row_name()
Exemple #43
0
 def test_read_with_tabs_and_header(self):
     """Reads a tab delimited file with a header.

     The header row should be split off into dfile.header and only the
     remaining row kept in dfile.lines.
     """
     dfile = util.read_dfile("testdata/simple.tsv", has_header=True)
     lines = dfile.lines
     # assertEquals is a deprecated alias removed in Python 3.12;
     # use the canonical assertEqual instead
     self.assertEqual(1, len(lines))
     self.assertEqual(["value11", "value12"], dfile.header)
Exemple #44
0
    def setUp(self):  # pylint: disable-msg=C0103
        """Test fixture: build the halo ratio matrix, organism and the
        configuration parameter dictionary shared by the tests."""
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        # load the example halo ratios with the standard filter chain
        matrix_factory = dm.DataMatrixFactory(
            [dm.nochange_filter, dm.center_scale_filter])
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True,
                                 quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = testutil.make_halo(self.search_distances,
                                           self.scan_distances,
                                           self.ratio_matrix)
        # configuration mirroring a real cMonkey run; the 'schedule'
        # lambdas make every scoring function run at every iteration
        self.config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_cores': None,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
            'num_clusters': 43,
            'output_dir': 'out',
            'remap_network_nodes': False,
            'use_BSCM': False,
            'num_iterations': 2000,
            'debug': {},
            'search_distances': {
                'upstream': (-20, 150)
            },
            'Columns': {
                'schedule': lambda i: True
            },
            'Rows': {
                'schedule': lambda i: True,
                'scaling': ('scaling_const', 6.0)
            },
            'Motifs': {
                'schedule': lambda i: True,
                'scaling':
                ('scaling_rvec', 'seq(0, 1, length=num_iterations*3/4)')
            },
            'MEME': {
                'version': '4.3.0',
                'global_background': False,
                'schedule': lambda i: True,
                'nmotifs_rvec':
                'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                'max_width': 24,
                'arg_mod': 'zoops',
                'background_order': 3,
                'use_revcomp': 'True'
            },
            'Networks': {
                'schedule':
                lambda i: True,
                'scaling':
                ('scaling_rvec', 'seq(1e-5, 0.5, length=num_iterations*3/4)')
            }
        }
        self.membership = self.__read_members()  # relies on config_params
        self.iteration_result = {'iteration': 51, 'score_means': {}}
Exemple #45
0
        if not args.string and not args.operons:
            args.nonetworks = True

    # user overrides in config files
    if args.config:
        config.read(args.config)

    matrix_factory = dm.DataMatrixFactory(
        [dm.nochange_filter, dm.center_scale_filter])
    matrix_filename = args.ratios

    if matrix_filename.startswith('http://'):
        indata = util.read_url(matrix_filename)
        infile = util.dfile_from_text(indata, has_header=True, quote='\"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='\"')

    matrix = matrix_factory.create_from(infile)
    infile = None

    # override number of clusters either on the command line or through
    # the config file
    try:
        num_clusters = config.getint("General", "num_clusters")
    except:
        num_clusters = args.numclusters

    cmonkey_run = cmr.CMonkeyRun(args.organism,
                                 matrix,
                                 string_file=args.string,
                                 rsat_organism=args.rsat_organism,
Exemple #46
0
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    """Read a ratio file and split it into sub-matrices for an ensemble run.

    Silently does nothing when the ratio file does not exist.
    """
    factory = DataMatrixFactory([nochange_filter, center_scale_filter])
    if not os.path.exists(ratiofile):
        return
    dfile = util.read_dfile(ratiofile, has_header=True, quote='\"')
    ratio_matrix = factory.create_from(dfile)
    split_matrix(ratio_matrix, outdir, n, kmin, ratio_matrix.num_columns)
Exemple #47
0
                value = ratios.values[row][col]
                outfile.write("%d\t%d\t%f\n" % (gene_id, cond_id, value))

if __name__ == '__main__':
    description = 'addnwportal.py - adding a cMonkey/python run to the database'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--resultdir', required=True, help='cMonkey result directory')
    parser.add_argument('--exptable', help='filename of expression table to generate',
                        default=None)
    args = parser.parse_args()
    resultdb = os.path.join(args.resultdir, 'cmonkey_run.db')
    ratiofile = os.path.join(args.resultdir, 'ratios.tsv.gz')

    # read the matrix
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter, dm.center_scale_filter])
    infile = util.read_dfile(ratiofile, has_header=True, quote='\"')
    ratios = matrix_factory.create_from(infile)

    # access the run information
    conn = sqlite3.connect(resultdb)
    cursor = conn.cursor()
    cursor.execute('select organism, species, num_iterations, num_clusters from run_infos')
    orgcode, species, num_iterations, num_clusters = cursor.fetchone()
    print "organism: %s species: %s iterations: %d clusters: %d" % (orgcode, species,
                                                                    num_iterations,
                                                                    num_clusters)

    # start populating the database
    microbedb, organism = make_microbe(orgcode)
    ncbi_code = microbedb.rsat_info.taxonomy_id
    ucsc_code = UCSC_MAP[orgcode]