def parse_res_list(cont):
    # Extract percent-encoded PDF links from a search-result page and
    # rebuild them as plain http/https URLs; returns a de-duplicated set.
    ret = []
    if cont == '':
        return set([])
    for url in str_util.cut_windows(cont, 'http%3a%2f%2f', '.pdf'):
        if url.find('<') >= 0 or url.find('>') >= 0:
            continue
        ret.append('http://%s.pdf' % url)
    for url in str_util.cut_windows(cont, 'https%3a%2f%2f', '.pdf'):
        if url.find('<') >= 0 or url.find('>') >= 0:
            continue
        ret.append('https://%s.pdf' % url)
    return set(ret)
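# str_util.cut_windows is an in-house helper that is not included in this
# snippet. The sketch below is only an assumption of its semantics as implied
# by the callers in this repo (collect every substring between a start marker
# and the following end marker); the real implementation may differ.
def cut_windows_sketch(cont, start, end):
    # Works for both str and bytes, matching the str and bytes callers here.
    ret = []
    pos = 0
    while True:
        s = cont.find(start, pos)
        if s < 0:
            break
        s += len(start)
        e = cont.find(end, s)
        if e < 0:
            break
        ret.append(cont[s:e])
        pos = e + len(end)
    return ret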
def get_department(cont):
    # Pull the "就诊科室" (treating department) field out of the page markup.
    r1 = str_util.cut_windows(cont, '就诊科室:</span><span class="fl txt-right">', '</span></p>')
    if len(r1) > 0:
        return r1[0].strip()
    return ''
def get_group(cont):
    # Pull the "易感人群" (susceptible population) field out of the page markup.
    r1 = str_util.cut_windows(cont, '易感人群:</span><span class="fl txt-right">', '</span></p>')
    if len(r1) > 0:
        return r1[0].strip()
    return ''
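# Hedged usage sketch for the two extractors above; 'disease_page.html' is a
# hypothetical saved copy of a fetched disease page, not a file in this repo:
#
#   cont = open('disease_page.html', encoding='utf-8').read()
#   department = get_department(cont)   # e.g. the treating-department string
#   group = get_group(cont)             # e.g. the susceptible-population string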
def parse_link(self, ori_url, page):
    # Extract follow-up links from a fetched page (for search results),
    # de-duplicate them against the visited-URL database, and enqueue new ones.
    new_url_list = []
    if ori_url.startswith('https://search.yahoo.com/search'):
        try:
            page = page.decode('utf-8')
        except Exception:
            print('page decode failed', file=sys.stderr)
        # Yahoo wraps target URLs percent-encoded; recover direct PDF links.
        for link in str_util.cut_windows(page, 'http%3a%2f%2f', '.pdf'):
            if link.find('<') >= 0 or link.find('>') >= 0:
                continue
            link = 'http://%s.pdf' % link.replace('%2f', '/')
            new_url_list.append(link)
    soup = BeautifulSoup(page, 'html.parser')
    for a in soup.findAll('a', href=True):
        link = a['href']
        if link != '' and link.startswith('/'):
            link = get_site(ori_url) + link
        if not link.startswith('http'):
            # prefix bare links with a scheme; existing http/https links are kept
            link = 'http://' + link
        valid = self.check_link(link)
        if self.debug:
            print('link\t%s\t%s\t%d' % (ori_url, link, valid), file=sys.stderr)
        if not valid:
            continue
        new_url_list.append(link)
    link_num = 0
    lock.acquire()
    for link in new_url_list:
        url_visited = check_key(self.url_db_dir, link)
        if url_visited:
            continue
        key = bytes(link, encoding='utf-8')
        add_kv(self.url_db_dir, key, b'')
        if len(self.url_queue) < self.queue_max_size:
            self.url_queue.append(link)
            link_num += 1
        else:
            print('warning: queue size exceeded', file=sys.stderr)
    lock.release()
    return link_num
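# check_key, add_kv and get_site are defined elsewhere in the repo and are not
# shown here. The stand-ins below are only a sketch consistent with how they
# are called in parse_link (a directory used as a small visited-URL store and
# a scheme://host helper); the real code may use an actual on-disk database.
import hashlib
import os
import urllib.parse

def check_key_sketch(db_dir, key):
    # True if this key was stored before; keys are hashed into file names.
    if isinstance(key, str):
        key = key.encode('utf-8')
    return os.path.exists(os.path.join(db_dir, hashlib.md5(key).hexdigest()))

def add_kv_sketch(db_dir, key, value):
    # Persist one key/value pair as a small file named by the key hash.
    with open(os.path.join(db_dir, hashlib.md5(key).hexdigest()), 'wb') as fp:
        fp.write(value)

def get_site_sketch(url):
    # scheme://netloc prefix, used to absolutize root-relative hrefs.
    p = urllib.parse.urlparse(url)
    return '%s://%s' % (p.scheme, p.netloc)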
def parse_link(self, ori_url, page):
    # Variant of parse_link (for search results) that enqueues links directly,
    # without the visited-URL database.
    link_num = 0
    if ori_url.startswith('https://search.yahoo.com/search'):
        try:
            page = page.decode('utf-8')
        except Exception:
            print('page decode failed', file=sys.stderr)
        for link in str_util.cut_windows(page, 'http%3a%2f%2f', '.pdf'):
            if link.find('<') >= 0 or link.find('>') >= 0:
                continue
            link = 'http://%s.pdf' % link.replace('%2f', '/')
            lock.acquire()
            self.url_queue.append(link)
            #self.url_queue.append('http://' + urllib.parse.urlparse(link).netloc)
            link_num += 1
            lock.release()
    soup = BeautifulSoup(page, 'html.parser')
    for a in soup.findAll('a', href=True):
        link = a['href']
        if link != '' and link.startswith('/'):
            link = get_site(ori_url) + link
        #print('link\t%s\t%s' % (ori_url, link))
        if not self.check_link(link):
            continue
        lock.acquire()
        if len(self.url_queue) < self.queue_max_size:
            self.url_queue.append(link)
            link_num += 1
        else:
            print('warning: queue size exceeded', file=sys.stderr)
        lock.release()
    return link_num
def get_dis(cont):
    r1 = str_util.cut_windows(cont, '<title>', '的症状')
    if len(r1) > 0:
        return r1[0]
    return ''
def get_sym(cont):
    # Extract the symptom names linked inside the single
    # "db f12 lh240 mb15" block; bail out if the block is missing or repeated.
    r1 = str_util.cut_windows(cont, 'db f12 lh240 mb15', '</span>')
    if len(r1) != 1:
        return []
    r2 = str_util.cut_windows(r1[0], '"_blank">', '</a>')
    return r2
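# Hedged usage sketch combining the extractors above; fetch_page and
# disease_url are hypothetical names standing in for the crawler's own
# download path, used only to show how the pieces fit together:
#
#   cont = fetch_page(disease_url)   # decoded HTML text of a disease page
#   disease = get_dis(cont)          # disease name taken from <title>
#   symptoms = get_sym(cont)         # list of linked symptom names
#   print('%s\t%s' % (disease, ','.join(symptoms)))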
import sys, os, time, random
sys.path.append('../../../alg/basic')
import str_util

# Split a crawler dump into individual PDF files: each record in the dump
# is framed as b'page: ' ... b'\n~EOF!\n'.
i = 1
cont = open(sys.argv[1], 'rb').read()
retdir = sys.argv[2]
for part in str_util.cut_windows(cont, b'page: ', b'\n~EOF!\n'):
    fp = open('%s/%d.pdf' % (retdir, i), 'wb')
    fp.write(part)
    fp.close()
    i += 1
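# The b'page: ' ... b'\n~EOF!\n' framing above implies a writer on the crawler
# side that is not part of this snippet. A minimal sketch of such a writer
# (record layout assumed; the real crawler may store extra fields such as the
# source URL) could look like:
def append_record_sketch(dump_path, content):
    # content: raw bytes of one fetched page/PDF, appended as a framed record.
    with open(dump_path, 'ab') as fp:
        fp.write(b'page: ')
        fp.write(content)
        fp.write(b'\n~EOF!\n')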