def login(self):
    """
    Logs in to MSigDB: first requests the login page to obtain
    session headers, then posts the credentials.
    """
    url = data_formats.urls['msigdb']['login1']
    self.pre_session = dataio.curl(
        url,
        init_url=url,
        silent=False,
        cache=False,
        init_headers=True)
    url = data_formats.urls['msigdb']['login2']
    # password masked here; substitute the real account password
    post = {'j_username': self.user, 'j_password': '******'}
    self.session = dataio.curl(
        url,
        init_url=url,
        post=post,
        req_headers=self.pre_session,
        silent=False,
        cache=False,
        init_headers=True)
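# Usage sketch (hedged: assumes the class is instantiated elsewhere with an
# MSigDB account, e.g. ``msig = Gsea(user='user@example.org')``; the class
# name and constructor signature are hypothetical, not from this module):
#
#     msig.login()    # sets self.pre_session, then self.session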
def load_set(self, setname, map_ids=True):
    """
    Downloads one gene set by name, stores its description and
    writes its member gene symbols into the set storage.
    """
    url = data_formats.urls['msigdb']['one_set'] % setname
    data = dataio.curl(url, req_headers=self.session, silent=True)
    data = data.split('\n')
    # the second line carries the description, prefixed by 2 characters
    self.info[setname] = data[1][2:]
    # remaining lines are gene symbols; strip whitespace, drop empty lines
    self.write_set((j for j in (i.strip() for i in data[2:]) if len(j) > 0),
                   setname, 'symbol', map_ids)
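# Example (hedged: assumes a logged-in instance ``msig`` as above, and a
# valid MSigDB gene set name):
#
#     msig.load_set('HALLMARK_APOPTOSIS')
#     msig.info['HALLMARK_APOPTOSIS']    # short description of the set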
def load_collection(self, collname, id_type='entrez', map_ids=True,
                    cachedir='cache'):
    """
    Downloads all gene sets of one collection, or loads them from the
    pickle cache if one already exists for this collection.
    """
    if os.path.exists(os.path.join(cachedir, 'gsea-%s.pickle' % collname)):
        self.load([collname])
        return None
    url = self.collections[collname]['urls'][id_type]
    data = dataio.curl(
        url,
        req_headers=self.session,
        silent=False,
        cache=False,
        write_cache=True)
    data = data.split('\n')
    names = []
    prg = progress.Progress(len(data), 'Loading gene sets', 1)
    # GMT format: one set per line, tab separated; field 1 is the set
    # name, field 2 a link, the remaining fields are the member gene IDs
    for line in (l.split('\t') for l in data if len(l) > 0):
        prg.step()
        setname = line[0].strip()
        self.write_set(line[2:], setname, id_type, map_ids)
        self.get_desc(setname)
        names.append(setname)
    prg.terminate()
    self.groups[collname] = set(names)
    self.save([collname], cachedir=cachedir)
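# Example (hedged: assumes ``list_collections()`` has already populated
# ``self.collections``; the collection key 'C2' is illustrative):
#
#     msig.load_collection('C2')
#     # a second call finds 'cache/gsea-C2.pickle' and loads from there
#     msig.load_collection('C2')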
def list_collections(self):
    """
    Parses the MSigDB collections page and populates
    ``self.collections`` with name, set count and download URLs.
    """
    # captures the collection name and the number of sets in parentheses
    renm = re.compile(r'(.+)\([^0-9]*([0-9]*)[^0-9]*\)')
    url = data_formats.urls['msigdb']['coll']
    html = dataio.curl(url, req_headers=self.session, silent=False)
    soup = bs4.BeautifulSoup(html, 'lxml')
    for col in soup.find('table', class_='lists1').find_all('tr'):
        lname, num = renm.findall(col.find('th').text.replace('\n', ''))[0]
        sname = col.find('a').attrs['name']
        # the last 3 links of each row point to downloads in different
        # ID types; key each URL by the ID type taken from the file name
        urls = dict(
            [(d.attrs['href'].split('.')[-2],
              data_formats.urls['msigdb']['url_stem'] % d.attrs['href'])
             for d in col.find_all('a')[-3:]])
        self.collections[sname] = {
            'name': lname,
            'count': int(num),
            'urls': urls
        }
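# After this call ``self.collections`` maps short collection names to
# dicts; a sketch of one entry (all values illustrative, not real data):
#
#     {'name': 'curated gene sets',
#      'count': 1234,
#      'urls': {'entrez': '<download URL>', 'symbols': '<download URL>'}}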
def get_desc(self, setname):
    """
    Downloads and stores the description of one gene set.
    """
    url = data_formats.urls['msigdb']['one_set'] % setname
    txt = dataio.curl(url, req_headers=self.session, silent=True)
    self.info[setname] = txt.split('\n')[1][2:]