def main(argv):
    args = parse_args(argv[1:])

    # Pull the substitution-model components out of the model library.
    with args.tdir.joinpath("model.xml").open() as file:
        model_id = args.model.split("+", maxsplit=1)[0]
        model = BeautifulSoup(file, "xml").find("model", id=model_id)
        sub_model = model.select_one("subModel")
        site_model = model.select_one("#sitemodel")
        operators = model.select_one("operators")
        prior = model.select_one("prior")
        log = model.select_one("log")

    with args.tdir.joinpath(f"{args.clock}-{args.coalescent}.xml").open() as file:
        soup = BeautifulSoup(file, "xml")

    # taxa
    tag_tax, tag_aln = taxa_tags(soup, args.msa, args.dregex, args.dformat)
    soup.beast.insert(0, tag_tax)
    soup.beast.insert(1, tag_aln)

    # model
    soup.beast.insert(2, sub_model)
    soup.beast.insert(3, site_model)
    if model.has_attr("operators"):
        for ele in list(operators.children):
            soup.beast.operators.append(ele)
    if model.has_attr("prior"):
        for ele in list(prior.children):
            soup.beast.mcmc.joint.prior.append(ele)
    if model.has_attr("log"):
        for ele in list(log.children):
            soup.select_one("#fileLog").append(ele)
    if "+G" in args.model:
        gammaize(soup)
    if "+I" in args.model:
        invariantize(soup)

    # MCMC
    soup.select_one("mcmc")["chainLength"] = args.len_mcmc
    soup.select_one("mcmc")["operatorAnalysis"] = args.stem + ".ops"
    soup.select_one("#fileLog")["logEvery"] = args.echo_mcmc
    soup.select_one("#fileLog")["fileName"] = args.stem + ".log"
    soup.select_one("logTree")["logEvery"] = args.echo_mcmc
    soup.select_one("logTree")["fileName"] = args.stem + ".trees"
    soup.select_one("#screenLog")["logEvery"] = args.echo

    # PS/SS
    psss_tags(soup, args.tdir.joinpath("psss.xml"), **vars(args))

    print(soup.prettify())
    return 0
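# The BEAST templates above are addressed with CSS id selectors ("#fileLog",
# "#sitemodel"). A minimal runnable sketch, assuming bs4 with lxml and
# soupsieve installed, of how select_one("#...") resolves against an XML tree:
from bs4 import BeautifulSoup

xml = '<beast><log id="fileLog" logEvery="1000"/></beast>'
soup = BeautifulSoup(xml, "xml")
tag = soup.select_one("#fileLog")  # id selectors match the id attribute in XML too
tag["fileName"] = "run1.log"       # attribute assignment mutates the tree in place
print(soup.prettify())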
# Requires: import requests; from requests.exceptions import Timeout;
# from bs4 import BeautifulSoup
def download(self, url, rc=5, data=None, proxies=None):
    """Fetch a page, retrying up to `rc` times; return a soup or None."""
    try:
        rs = requests.request('GET', url, headers=self.headers, data=data)
        content = rs.content
        soup = BeautifulSoup(content, 'lxml')
        # NOTE: the document root never carries attributes, so this
        # has_attr('href') check is always False as written.
        if soup.has_attr('href'):
            if "该页未找到" in soup.title.string:  # "page not found" marker in the title
                if rc > 0:
                    print("Page not found, retrying download:", url, rc)
                    soup = self.download(url, rc - 1)
                else:
                    print("Download failed: no retries left")
                    soup = None
    except Timeout as e:
        print('Downloader download ConnectionError or Timeout: ' + str(e))
        soup = None
        if rc > 0:
            print("Timed out, retrying download:", url, rc)
            soup = self.download(url, rc - 1)
    except Exception as e:
        print('Downloader download Exception: ' + str(e))
        soup = None
        if rc > 0:
            print("Error, retrying download:", url, rc)
            soup = self.download(url, rc - 1)
    return soup
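# A runnable illustration of why the soup.has_attr('href') check above can
# never fire: has_attr answers for the tag it is called on, and the
# BeautifulSoup document root carries no attributes of its own.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/x">link</a>', 'html.parser')
print(soup.has_attr('href'))    # False: the document root has no attributes
print(soup.a.has_attr('href'))  # True: the <a> Tag actually carries href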
def make_tag(tag0, klass, data, ndp=None, template=None, poset=None):
    # NOTE: realpath, generate_pdf, crop_pdf and create_a_to_data come from
    # the enclosing module scope.
    svg = data['svg']
    tag_svg = BeautifulSoup(svg, 'lxml', from_encoding='utf-8').svg
    assert tag_svg.name == 'svg'

    # Rescale the SVG if it declares explicit pt dimensions.
    if tag_svg.has_attr('width'):
        ws = tag_svg['width']
        hs = tag_svg['height']
        assert 'pt' in ws
        w = float(ws.replace('pt', ''))
        h = float(hs.replace('pt', ''))
        scale = MCDPConstants.scale_svg
        w2 = w * scale
        h2 = h * scale
        tag_svg['width'] = w2
        tag_svg['height'] = h2
        tag_svg['rescaled'] = 'Rescaled from %s %s, scale = %s' % (ws, hs, scale)
    else:
        print('no width in SVG tag: %s' % tag_svg)

    tag_svg['class'] = klass

    # Carry the style/id of the source tag over to the SVG.
    if tag0.has_attr('style'):
        tag_svg['style'] = tag0['style']
    if tag0.has_attr('id'):
        tag_svg['id'] = tag0['id']

    if generate_pdf:
        pdf0 = data['pdf']
        pdf = crop_pdf(pdf0, margins=0)
        div = Tag(name='div')
        att = MCDPConstants.ATTR_LOAD_NAME
        # Choose a basename for the downloadable PDF.
        if tag0.has_attr('id'):
            basename = tag0['id']
        elif ndp is not None and hasattr(ndp, att):
            basename = getattr(ndp, att)
        elif template is not None and hasattr(template, att):
            basename = getattr(template, att)
        elif poset is not None and hasattr(poset, att):
            basename = getattr(poset, att)
        else:
            # encode() needed on Python 3, where hashlib requires bytes
            hashcode = hashlib.sha224(tag0.string.encode('utf-8')).hexdigest()[-8:]
            basename = 'code-%s' % hashcode
        docname = os.path.splitext(os.path.basename(realpath))[0]
        download = docname + "." + basename + "." + klass + '.pdf'
        a = create_a_to_data(download=download, data_format='pdf', data=pdf)
        a['class'] = 'pdf_data'
        a.append(NavigableString(download))
        div.append(tag_svg)
        div.append(a)
        return div
    else:
        return tag_svg
def save_contact(self, name, phone) -> bool:
    """
    :param name: display name for the new contact
    :type name: str
    :param phone: phone number for the new contact
    :type phone: str
    :return: whether the contact exists after the save attempt
    :rtype: bool
    """
    from bs4 import BeautifulSoup

    command = self._build_am_start(
        {
            IntentFlags.ACTION: AndroidActionInsert,
            IntentFlags.MIME_TYPE: AndroidVndContact,
            IntentFlags.EXTRA_STRING_VALUE: [
                {"value": "name", "extra": f"\"{name}\""},
                {"value": "phone", "extra": f"{phone}"},
            ],
        },
        app_distinct=True,
        app_name="com.android.contacts")
    self.device.adb_utils.shell(command)

    # The dump is done to try to get the contact sync notification, and thus
    # be able to click to not sync.
    xml = self.device.adb_utils.dump_hierarchy()
    soup = BeautifulSoup(xml, "xml")  # explicit parser: uiautomator dumps are XML
    for node in soup.find_all("node", {"resource-id": "com.android.contacts:id/text"}):
        if node.has_attr("text") and "contacts online" in node["text"]:
            # Some devices may already start with the "do not sync" button in
            # focus, so we hit enter.
            self.device.adb_device.input_keyevent(AndroidKeyEvents.ENTER)
            self.device.adb_device.input_keyevent(AndroidKeyEvents.TAB)
            self.device.adb_device.input_keyevent(AndroidKeyEvents.ENTER)

    # The HOME button is capable of saving the contact; it is worth noting
    # that this is not guaranteed to work correctly.
    # TODO: search for a way to ensure that the contact has been saved or not.
    self.device.adb_device.input_keyevent(AndroidKeyEvents.HOME)

    # It is necessary to finish the process; otherwise, when we return,
    # Android will try to save the previous contact again.
    self.device.adb_utils.app_stop("com.android.contacts")
    return self.contact_exists(phone)
def get_href(cls, element: BeautifulSoup) -> Optional[str]:
    # The element itself may carry the href ...
    if element.has_attr("href"):
        return element["href"]
    # ... otherwise look for an <a>: either the element is one,
    # or it contains one.
    if element.name == "a":
        a = element
    elif cls.exists(element.a):
        a = element.a
    else:
        return None
    return a.get("href")
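# A minimal usage sketch of the fallback order in get_href: the element's own
# href wins, then a nested <a>. `Helper` is a hypothetical stand-in for the
# class this method lives on; its exists() is assumed to be a truthiness check.
from typing import Optional
from bs4 import BeautifulSoup

class Helper:
    @staticmethod
    def exists(tag) -> bool:
        return tag is not None

    @classmethod
    def get_href(cls, element) -> Optional[str]:
        if element.has_attr("href"):
            return element["href"]
        if element.name == "a":
            a = element
        elif cls.exists(element.a):
            a = element.a
        else:
            return None
        return a.get("href")

doc = BeautifulSoup('<li><a href="/item/1">one</a></li>', "html.parser")
print(Helper.get_href(doc.li))  # /item/1  (via the nested <a>)
print(Helper.get_href(doc.a))   # /item/1  (the element itself is the <a>)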
def scrape_data_from_dom(dom: BeautifulSoup) -> dict:
    fighters = {}
    props = {}
    fighter_ids = []

    # CSS class names that define whether the <tr> contains data for a
    # fighter or for a prop.
    css_classes_fighter = [None]
    css_classes_prop = ['pr', 'pr-odd']

    # Loop through the fights, one odds table per fight.
    fight_tables = dom.find_all("table", class_="odds-table")
    for fight_table in fight_tables:
        # Extract the header with the names of the sports books.
        sports_books_names = Scraper.extract_sports_book_names_from_dom(fight_table)

        # Loop through each row in the fight table.
        for row in fight_table.find("tbody").find_all("tr"):
            # Make sure this row has <td> elements and isn't just the
            # "odds-table-responsive-header".
            if len(row.find_all("td")) == 0:
                continue
            sports_books = Scraper.extract_sports_book_values(row, sports_books_names)

            if not row.has_attr('class'):
                # The <tr>s without a class are the fighter rows.
                f = Fighter()
                f.load_dom(row, sports_books)
                fighters[f.id] = f  # add the Fighter instance to the dict being returned
                fighter_ids.append(f.id)
            elif [i for i in row['class'] if i in css_classes_prop]:
                # Any prop class on the row marks a prop row.
                # Context note: the website layout has two rows with the fighter
                # data, followed by all the props. This is the reason fighters
                # and fighter_ids are expected to be populated at this point.
                p = Prop()
                p.load_dom(row, sports_books, fighters[fighter_ids[-1]],
                           fighters[fighter_ids[-2]])
                props[p.id] = p  # add the Prop instance to the dict being returned

    return {'fighters': fighters, 'props': props}
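# bs4 treats class as a multi-valued attribute, which is why the prop check
# above iterates row['class'] as a list; a runnable illustration:
from bs4 import BeautifulSoup

row = BeautifulSoup('<tr class="pr pr-odd"><td>o/u</td></tr>', 'html.parser').tr
print(row.has_attr('class'))  # True
print(row['class'])           # ['pr', 'pr-odd'] -- one list entry per class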
def find_syns(self, word):
    if word in self.synTable:
        return self.synTable[word]
    word = word.lower()
    html_doc = requests.get(self.base_uri + word).content
    t = BeautifulSoup(html_doc, 'html.parser')
    synonyms = []
    # Look for the section headed "Синонимы" ("Synonyms").
    for el in t.find_all("span", {'class': 'mw-headline'}):
        if u"Синонимы" in el.text:
            neededTags = el.find_parent().find_next_sibling().find_all("a")
            synonyms = [a.text.replace(u"ё", u"е")
                        for a in neededTags if not a.has_attr("class")]
    # "править" ("edit") in the list means we grabbed an edit link, not synonyms.
    if u"править" not in synonyms:
        answer = synonyms
    else:
        answer = []
    self.synTable[word] = answer
    return answer
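# The headline-then-sibling hop above is the usual way to grab the list that
# follows a MediaWiki section heading; a runnable miniature (markup invented):
from bs4 import BeautifulSoup

html = ('<h3><span class="mw-headline">Синонимы</span></h3>'
        '<ul><li><a href="/w1">быстрый</a></li></ul>')
soup = BeautifulSoup(html, 'html.parser')
headline = soup.find('span', {'class': 'mw-headline'})
links = headline.find_parent().find_next_sibling().find_all('a')
print([a.text for a in links])  # ['быстрый']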
def populate_staff_users(self, forum_content):
    user_json = {}
    user_section = BeautifulSoup(forum_content, "lxml").find(
        "section", attrs={"id": "discussion-container"})
    if user_section and user_section.has_attr("data-roles"):
        if "&quot;" in user_section["data-roles"]:
            # The JSON is HTML-escaped; unescape it before parsing.
            user_json = json.loads(html.unescape(user_section["data-roles"]))
        else:
            user_json = json.loads(user_section["data-roles"])
    else:
        user_section = re.search("roles: [^\n]*", forum_content)
        if user_section:
            # TODO: check that parsing is OK in this case
            user_json = json.loads(
                re.sub(r"roles: (.*),", r"\1", user_section.group()))
    for user in user_json:
        self.staff_user += [str(y) for y in user_json[user]]
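# A tiny runnable demo of the escaped-JSON path that the &quot; check above
# guards for:
import html
import json

raw = '{&quot;Staff&quot;: [4, 7]}'
print(json.loads(html.unescape(raw)))  # {'Staff': [4, 7]}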
def loadMp(self, loadfile, id, rc=5):
    print("Cache worker started, id: %d" % id)
    self.id = id
    while True:
        url = loadfile.get(True)
        if url == "kill":
            print("Worker", id, "download loop finished")
            return 0
        try:
            print("Caching page:", url, "queue length:", loadfile.qsize())
            s = time.time()
            rs = requests.request('GET', url, headers=self.headers)
            content = rs.content
            soup = BeautifulSoup(content, 'lxml')
            # NOTE: always False on the document root; see download() above.
            if soup.has_attr('href'):
                if "该页未找到" in soup.title.string:  # "page not found" marker
                    if rc > 0:
                        print("Page not found, retrying download:", url, rc)
                        soup = self.download(url, rc - 1)
                    else:
                        print("Download failed: no retries left")
                        soup = None
        except Timeout as e:
            print('Downloader download ConnectionError or Timeout: ' + str(e))
            soup = None
            if rc > 0:
                print("Timed out, retrying download:", url, rc)
                soup = self.download(url, rc - 1)
        except Exception as e:
            print('Downloader download Exception: ' + str(e))
            soup = None
            if rc > 0:
                print("Error, retrying download:", url, rc)
                soup = self.download(url, rc - 1)
        if soup:
            data = self.fr(self, soup)
            if data:
                self.datas.put(data)
        e = time.time()
        print("Finished caching page:", url, "took %d s" % (e - s))
def insert_into_db(self, parsedResults, query, resultnum):
    num = resultnum
    try:
        for r in parsedResults:
            soup = BeautifulSoup(r, 'lxml', from_encoding='utf8').find('div', class_='rb')
            if soup is not None and soup.has_attr('id'):
                soup['id'] = 'rb_' + str(num)
                robj = SearchResult.objects.create(query=query, rank=num,
                                                   result_id='rb_' + str(num),
                                                   content=str(soup))
                robj.save()
                num += 1
            else:
                print("THE RESULT IS NOT VALID", resultnum)
    except Exception as e:
        print("roll back!")
        print(e)
        transaction.rollback()
        return resultnum
    else:
        print("commit success!")
        # NOTE: calling transaction.atomic() here is a no-op; the call that
        # would match the log message is transaction.commit().
        transaction.atomic()
        return num
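# The try/except/else shape above is what keeps the commit path honest: the
# else suite runs only when the try body raised nothing. A minimal reminder:
def parse_or_report(text):
    try:
        value = int(text)
    except ValueError as exc:
        print('roll back!', exc)
    else:
        print('commit success!')  # reached only when int() did not raise
        return value

parse_or_report('42')    # commit success!
parse_or_report('nope')  # roll back! ...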
def process(data_file):
    # folders that are exported to
    #file_targets = []  # files that are intended to be exported to said folders
    data_str = open(data_file, 'r')
    #data = BeautifulSoup(data_str, "html5lib").contents[0].contents[1].contents[0]
    data = BeautifulSoup(data_str, "xml").contents[0]  # this respects self-closing tags
    data_str.close()

    if data.has_attr('debug'):
        debug = data['debug'] == 'True'
        print("debug:" + str(debug))
    raw_folder = None
    target_folder = None
    if data.has_attr('raw'):
        raw_folder = data['raw']
    if data.has_attr('target'):
        target_folder = data['target']
    else:
        raise ValueError('No target specified')
    if data.has_attr('watermark'):
        settings.watermark = data['watermark']
    print("raw:", raw_folder)
    print("target:", target_folder)
    print("watermark:", settings.watermark)

    # One bookkeeping entry per export category: output root, created
    # directories, and emitted files.
    target_folders = {}
    for key in ('galleries', 'thumbnails', 'images', 'gifs', 'video', 'music'):
        target_folders[key] = {
            'root': os.path.join(target_folder, key),
            'dir': set(),
            'files': [],
        }

    # process galleries
    galleries_tags = data.find_all('galleries')
    for galleries in galleries_tags:
        for gallery in galleries:
            if gallery.name != 'gallery':
                continue
            images = []
            try:
                name = gallery['name']
            except KeyError:
                raise ValueError('No name specified for gallery')
            print(name)
            # get images listed explicitly
            for i in gallery:
                if i.name != 'image':
                    continue
                try:
                    src = i['src']
                    f = os.path.join(raw_folder, src)
                    if image.is_image(f):
                        images.append(f)
                except KeyError:
                    src = None
            # get images for a whole folder
            try:
                folder = os.path.join(raw_folder, gallery['folder'])
                for f in os.listdir(folder):
                    if image.is_image(f):
                        images.append(os.path.join(folder, f))
            except Exception:
                folder = None
            # process gallery
            gallery_target = os.path.join(target_folder, 'galleries', name)
            target_folders['galleries']['dir'].add(gallery_target)
            if not os.path.exists(gallery_target):
                os.makedirs(gallery_target)
            thumbnail_target = os.path.join(target_folder, 'thumbnails', name)
            target_folders['thumbnails']['dir'].add(thumbnail_target)
            if not os.path.exists(thumbnail_target):
                os.makedirs(thumbnail_target)
            cnt = 1
            for i in images:
                print(' ', i)
                basename = os.path.basename(i)
                filename, ext = os.path.splitext(basename)
                # process web-sized copy
                out_file = os.path.join(gallery_target, name + '-' + str(cnt) + '.jpg')
                target_folders['galleries']['files'].append(out_file)
                image.process(i, out_file, ext='jpg', max=(800, 600), quality=95,
                              watermark=settings.watermark)
                # process thumbnail
                out_file = os.path.join(thumbnail_target,
                                        name + '-' + str(cnt) + '-thumbnail.jpg')
                target_folders['thumbnails']['files'].append(out_file)
                image.process(i, out_file, ext='jpg', max=(105, 140), quality=95)
                cnt += 1

    # process images
    images_tags = data.find_all('images')
    for images in images_tags:
        for tag in images:
            if tag.name != 'file':
                continue
            scale = 1.0
            try:
                scale = float(tag['scale'])
            except Exception:
                pass
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for image')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for image')
            # per-tag watermark override
            cur_watermark = None
            try:
                if tag['use_watermark'].lower() == 'true':
                    cur_watermark = settings.watermark
                elif tag['use_watermark'].lower() == 'false':
                    cur_watermark = None
            except KeyError:
                pass
            image_rel, filename = os.path.split(target)
            # process folder
            image_folder = os.path.join(target_folder, 'images', image_rel)
            target_folders['images']['dir'].add(image_folder)
            if not os.path.exists(image_folder):
                os.makedirs(image_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(image_folder, filename + '.jpg')
            target_folders['images']['files'].append(out_file)
            image.process(in_file, out_file, ext='jpg', quality=95, scale=scale,
                          watermark=cur_watermark)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    # process videos
    videos_tags = data.find_all('videos')
    for videos in videos_tags:
        # per-group watermark override
        vid_watermark = settings.watermark
        try:
            if videos['use_watermark'].lower() == 'false':
                vid_watermark = None
        except KeyError:
            pass
        for tag in videos:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for video')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for video')
            # per-tag watermark override
            cur_watermark = vid_watermark
            try:
                if tag['use_watermark'].lower() == 'true':
                    cur_watermark = settings.watermark
                elif tag['use_watermark'].lower() == 'false':
                    cur_watermark = None
            except KeyError:
                pass
            video_rel, filename = os.path.split(target)
            # process folder
            video_folder = os.path.join(target_folder, 'video', video_rel)
            target_folders['video']['dir'].add(video_folder)
            if not os.path.exists(video_folder):
                os.makedirs(video_folder)
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(video_folder, filename + '.mp4')
            out_image = os.path.join(video_folder, filename + '.jpg')
            target_folders['video']['files'].append(out_file)
            target_folders['video']['files'].append(out_image)
            video.process(in_file, out_file, out_image, watermark=cur_watermark)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)
            print(' ' + out_image)

    # process gifs
    gifs_tags = data.find_all('gifs')
    for gifs in gifs_tags:
        for tag in gifs:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for gif')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for gif')
            # optional attributes default to None (Tag.get mirrors dict.get)
            scale = tag.get('scale')
            crop_x = tag.get('crop_x')
            crop_y = tag.get('crop_y')
            crop_w = tag.get('crop_w')
            crop_h = tag.get('crop_h')
            start_time = tag.get('start_time')
            end_time = tag.get('end_time')
            gif_rel, filename = os.path.split(target)
            # process folder
            gif_folder = os.path.join(target_folder, 'gifs', gif_rel)
            target_folders['gifs']['dir'].add(gif_folder)
            if not os.path.exists(gif_folder):
                os.makedirs(gif_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(gif_folder, filename + '.gif')
            target_folders['gifs']['files'].append(out_file)
            gif.process(in_file, out_file, scale=scale, crop_x=crop_x,
                        crop_y=crop_y, crop_w=crop_w, crop_h=crop_h,
                        start_time=start_time, end_time=end_time)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    # process music
    musics_tags = data.find_all('music')
    for music in musics_tags:
        for tag in music:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for music')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for music')
            music_rel, filename = os.path.split(target)
            # process folder
            music_folder = os.path.join(target_folder, 'music', music_rel)
            target_folders['music']['dir'].add(music_folder)
            if not os.path.exists(music_folder):
                os.makedirs(music_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(music_folder, filename + '.mp3')
            target_folders['music']['files'].append(out_file)
            audio.process(in_file, out_file)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    if settings.delete:
        utilities.purify(target_folders)
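# For orientation, a hypothetical input file in the shape process() expects:
# one root element carrying raw/target/watermark attributes, with <galleries>,
# <images>, <videos>, <gifs> and <music> groups beneath it. Element and
# attribute names are taken from the parsing code above; the file contents
# themselves are invented, and the root tag name is arbitrary (the code only
# reads contents[0]):
#
#   <site raw="raw" target="export" watermark="wm.png" debug="True">
#     <galleries>
#       <gallery name="trip" folder="trip-photos">
#         <image src="extra/cover.jpg"/>
#       </gallery>
#     </galleries>
#     <images>
#       <file src="logo.png" target="branding/logo" scale="0.5" use_watermark="false"/>
#     </images>
#     <videos use_watermark="false">
#       <file src="clip.mov" target="clips/clip"/>
#     </videos>
#   </site>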
def start():
    with open(ENTRY_POINT_FILENAME, 'r') as entry_point_file:
        table = BeautifulSoup(entry_point_file, 'lxml')
    links = table.find_all('a')
    for link in links:
        href = link['href'] + '/by_Degree'
        college_page_link = BASE_URL + href
        college_page_html = urlopen(college_page_link).read()
        college_page_soup = BeautifulSoup(college_page_html, 'lxml')
        college = href.split('/')[3].replace('School=', '')
        for table in college_page_soup.find_all('table'):
            # Only the salary tables carry all three of these classes.
            if not (table.has_attr('class') and 'tlf' in table['class']
                    and 'f11' in table['class'] and 'w585' in table['class']):
                continue
            for row in table.find_all('tr'):
                # Skip header rows and rows missing either cell.
                if row.has_attr('class') or not row.th or not row.td:
                    continue
                try:
                    degree = row.th.a.string
                    salary = row.td.string
                except AttributeError:
                    continue
                # Salary may be a range '$x - $y', in which case take the
                # average; also strip the '$' and thousands separators.
                tokens = salary.split()
                if len(tokens) == 3:
                    low = int(tokens[0][1:].replace(',', ''))
                    high = int(tokens[2][1:].replace(',', ''))
                    salary = (low + high) / 2
                else:
                    salary = int(tokens[0][1:].replace(',', ''))
                salary = int(salary)
                # make db insertions
                degree_id = -1
                college_id = -1
                try:
                    degree = Degree(degree)
                    db.session.add(degree)
                    db.session.commit()
                    degree_id = degree.id
                except Exception:
                    # Likely a duplicate: roll back and reuse the existing row.
                    db.session.rollback()
                    degree = Degree.query.filter_by(name=degree.name).first()
                    if not degree:
                        continue
                    degree_id = degree.id
                college_id = find_college_id(college)
                if college_id == -1:
                    continue
                try:
                    college_degree_salary = CollegeDegreeSalary(degree_id, college_id, salary)
                    db.session.add(college_degree_salary)
                    db.session.commit()
                except Exception:
                    continue
def scrapeGame(game):
    currentRow = ""
    game = Soup(game, 'lxml')
    if game.has_attr("class"):
        return ""
    try:
        week = game.find('th').text
        score = game.find('td', {'data-stat': 'boxscore_word'}).find('a')
        if score is None:
            return ""
        scoreUrl = score.get('href')
        cbsUrl = 'https://www.pro-football-reference.com' + scoreUrl
        req = requests.get(cbsUrl)
        soup = Soup(req.text, 'lxml')
        scorebox = soup.find('div', {'class': 'scorebox'})
        home, away = [x.text for x in scorebox.find_all('a', {'itemprop': 'name'})]

        def starters_row(div_id, table_id, team):
            """Build one CSV row of starters for a team.

            The starters tables are embedded inside HTML comments, so each
            comment is re-parsed before extracting its rows. Identical logic
            was originally duplicated for the home and visiting teams.
            """
            row = str(week) + "," + team + ","
            div = soup.find('div', {'id': div_id})
            comments = div.find_all(text=lambda text: isinstance(text, Comment))
            positionMap = {'QB': " ", 'LT': " ", 'LG': " ", 'C': " ",
                           'RG': " ", 'RT': " "}
            found = False
            rbs, wrs, tes, lbs, des, dts, cbs, safeties = [], [], [], [], [], [], [], []
            for elem in comments:
                table = Soup(elem, 'lxml').find('table', {'id': table_id})
                if table is None:
                    continue
                for guy in table.find_all('tr'):
                    name = guy.find('th')
                    pos = guy.find('td')
                    if pos is None or name is None:
                        continue
                    if 'RB' in pos.text or 'HB' in pos.text or 'FB' in pos.text:
                        rbs.append(name.text)
                    elif 'LB' in pos.text or 'WILL' in pos.text or 'MIKE' in pos.text:
                        lbs.append(name.text)
                    elif 'WR' in pos.text:
                        wrs.append(name.text)
                    elif 'TE' in pos.text:
                        tes.append(name.text)
                    elif 'DE' in pos.text or 'DL' in pos.text:
                        des.append(name.text)
                    elif 'DT' in pos.text or 'NT' in pos.text:
                        dts.append(name.text)
                    elif 'CB' in pos.text or 'DB' in pos.text:
                        cbs.append(name.text)
                    elif 'S' in pos.text:
                        safeties.append(name.text)
                    else:
                        positionMap[pos.text] = name.text
                    found = True
            if found:
                row += positionMap['QB'] + ","
                if rbs:
                    row += ",".join(rbs) + ","
                if wrs:
                    row += ",".join(wrs) + ","
                if tes:
                    row += ",".join(tes) + ","
                row += (positionMap['LT'] + "," + positionMap['LG'] + "," +
                        positionMap['C'] + "," + positionMap['RG'] + "," +
                        positionMap['RT'] + ",")
                if des:
                    row += ",".join(des) + ","
                if dts:
                    row += ",".join(dts) + ","
                if lbs:
                    row += ",".join(lbs) + ","
                if cbs:
                    row += ",".join(cbs) + ","
                row += ",".join(safeties)
            row += '\n'
            return row

        currentRow += starters_row('all_home_starters', 'home_starters', home)
        currentRow += starters_row('all_vis_starters', 'vis_starters', away)
        return currentRow
    except AttributeError:
        return ""
# 1. Check all works in Expression, compare updated time against
#    the last change time of origin. (=> UPDATED if outdated
#    by timestamp, => DELETE if source work disappears)
# 2. Check all works in origin which do not exist in Expression
#    (=> NEW WORKS)
entity = session.query(Expression).filter(Expression.collection_url == cbase).all()
for row in entity:
    srcwork = session.query(WikimediaItems).filter(
        WikimediaItems.title == row.source_id).first()
    if not srcwork:
        # Source work disappeared: drop the local copy.
        session.delete(row)
        session.commit()
    elif srcwork.updated_date > row.updated_date:
        # Source is newer: refresh our record from it.
        soup = BeautifulSoup(srcwork.artist)
        artist = soup.get_text()
        # Re-parse only the <a> tags of the artist markup to recover a link.
        link = BeautifulSoup(srcwork.artist, parse_only=SoupStrainer('a'))
        if link.a is not None and link.a.has_attr('href'):
            artisturl = link.a['href']
        else:
            artisturl = None
        soup = BeautifulSoup(srcwork.imagedescription)
        desc = soup.get_text()
        license = valid_license(srcwork.license_url)
        if not license:
            log.debug("Invalid license: %s" % srcwork.license_url)
            continue
        session.query(Expression).filter(Expression.id == row.id).update({
            Expression.title: srcwork.title,
            Expression.description: desc,
            Expression.rights_statement: license,
            Expression.credit: artist,
            Expression.credit_url: artisturl,
        })
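# SoupStrainer makes the parser keep only matching tags, which is why the
# fragment above parses the artist markup twice: once whole for the text and
# once strained for the link. A runnable sketch:
from bs4 import BeautifulSoup, SoupStrainer

markup = 'by <a href="https://example.org/alice">Alice</a> (photo)'
text = BeautifulSoup(markup, 'html.parser').get_text()
links = BeautifulSoup(markup, 'html.parser', parse_only=SoupStrainer('a'))
print(text)             # by Alice (photo)
print(links.a['href'])  # https://example.org/alice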
site_model = model.select_one("#sitemodel")
operators = model.select_one("operators")
prior = model.select_one("prior")
log = model.select_one("log")

with PATH_TEMPLATES.joinpath(f"{params.clock}-{params.coal}.xml").open() as stream:
    soup = BeautifulSoup(stream, "xml")

# taxa
tag_tax, tag_aln = taxa_tags(soup, str(snakemake.input.fas))
soup.beast.insert(0, tag_tax)
soup.beast.insert(1, tag_aln)

# model
soup.beast.insert(2, sub_model)
soup.beast.insert(3, site_model)
if model.has_attr("operators"):
    for ele in list(operators.children):
        soup.beast.operators.append(ele)
if model.has_attr("prior"):
    for ele in list(prior.children):
        soup.beast.mcmc.joint.prior.append(ele)
if model.has_attr("log"):
    for ele in list(log.children):
        soup.select_one("#fileLog").append(ele)
if "+G" in model_bic:
    gammaize(soup)
if "+I" in model_bic:
    invariantize(soup)

# MCMC
soup.select_one("mcmc")["chainLength"] = params.mcmc_len
soup.select_one("mcmc")["operatorAnalysis"] = params.stem + ".ops"
def findFirst(link):
    """
    Receives a link to a Wikipedia page.
    Finds and returns a string representing the first link in the page.
    """
    # Get to the content part of the HTML where we'll find the link.
    info = BeautifulSoup(requests.get(link).text, 'html.parser')
    info = info.body.find(id="content").find(id="bodyContent")
    info = info.find(id="mw-content-text").div
    info = info.next

    # Cycle through content until we find the first link.
    while (info == '\n' or info.has_attr('class') or info.name == 'style'):
        info = info.next_sibling

    # Handle the case where the first paragraph has no link: keep advancing
    # until a paragraph with an <a> is found.
    while True:
        try:
            newLink = base + info.a['href']
            break
        except TypeError:
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling

    # Skip links to "Geographic coordinate system" (page coordinates).
    while True:
        if newLink == "https://en.wikipedia.org/wiki/Geographic_coordinate_system":
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling
            newLink = base + info.a['href']
        else:
            break

    # Skip citation links such as "[1]".
    while True:
        if info.a.text == "[1]":
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling
        else:
            break
    newLink = base + info.a['href']

    # Strip the "https://en.wikipedia.org" prefix (24 characters).
    return newLink[24:]
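# A self-contained miniature of the skip loop used throughout findFirst, with
# an explicit Tag check instead of the '\n' comparison (NavigableStrings have
# no has_attr, so a plain string other than '\n' would crash the original
# condition); markup invented:
from bs4 import BeautifulSoup, Tag

doc = BeautifulSoup(
    "<div><p class='hatnote'>skip me</p>\n"
    "<p>First <a href='/wiki/Link'>link</a>.</p></div>",
    'html.parser')
node = doc.div.contents[0]
while not isinstance(node, Tag) or node.has_attr('class') or node.name == 'style':
    node = node.next_sibling  # skip whitespace and any tag carrying a class
print(node.a['href'])  # /wiki/Link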
def sub(_form):
    # Imports
    from bs4 import BeautifulSoup
    from django.template.defaultfilters import safe
    from smmaranim.custom_settings import ERROR_MESSAGES
    from smmaranim.custom_settings import MAY_BE_REQUIRED_FIELD
    from smmaranim.custom_settings import REQUIRED_FIELD

    output = {}

    # Default template for a wrapped field
    gabarit_defaut = '''
    <div class="field-wrapper" id="fw_{}">
        <span class="field-label">{}</span>
        <span class="field">{}</span>
        <span class="field-error-message"></span>
    </div>
    '''

    for champ in _form:
        # Override the default error messages
        for cle, val in ERROR_MESSAGES.items():
            _form.fields[champ.name].error_messages[cle] = val

        # Convert the field to HTML (i.e. a string) and parse it
        champ__str = BeautifulSoup('{}'.format(champ), 'html.parser')

        # Append a note to the label of every required field
        if champ.label:
            strs = champ.label.split('|')
            if _form.fields[champ.name].required:
                strs[0] += REQUIRED_FIELD
            else:
                for elem in champ__str.find_all():
                    if 'may-be-required' in elem.attrs.keys():
                        strs[0] += MAY_BE_REQUIRED_FIELD
            if champ.help_text:
                strs[0] += '<span class="help-icon" title="{}"></span>'.format(champ.help_text)
            champ.label = '|'.join(strs)

        # Compute the value of the name attribute
        attr_name = '{}-{}'.format(_form.prefix, champ.name) if _form.prefix else champ.name

        # Strip the required attribute (and the now-consumed marker)
        for elem in champ__str.find_all():
            if 'may-be-required' in elem.attrs.keys():
                del elem['may-be-required']
            if 'required' in elem.attrs.keys():
                del elem['required']

        # Get the field's widget type
        type_champ = champ.field.widget.__class__.__name__

        # Pick the template
        if type_champ == 'CheckboxInput':
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field">{}</span>
                <span class="field-label">{}</span>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ__str, champ.label)
        elif type_champ == 'ClearableFileInput':
            # Keep the file and checkbox inputs
            input_checkbox = champ__str.find('input', {'type': 'checkbox'})
            input_file = champ__str.find('input', {'type': 'file'})
            # Initialise the info block
            infos = ''
            for a in champ__str.find_all('a'):
                # Show the "Effacer" (clear) option when defined
                if input_checkbox:
                    delete = '''
                    <span class="delete-file">
                        {}
                        <label for="{}-clear_id">Effacer</label>
                    </span>
                    '''.format(input_checkbox, attr_name)
                else:
                    delete = ''
                infos = '''
                <div class="if-return">
                    <span class="file-infos">{}</span>
                    {}
                </div>
                '''.format(a['href'], delete)
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="if-container">
                    <span class="field">{}</span>
                    <span class="if-trigger">Parcourir</span>
                    {}
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, input_file, infos)
        elif type_champ == 'DateInput':
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="form-group">
                    <span class="field">
                        <div class="input-group">
                            {}
                            <span class="date input-group-addon" style="cursor: pointer;">
                                <input name="{}__datepicker" type="hidden">
                                <span class="glyphicon glyphicon-calendar"></span>
                            </span>
                        </div>
                    </span>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, champ__str, attr_name)
        elif type_champ == 'DateTimeInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'EmailInput':
            # Grab the <input/> of type email
            champ__str = champ__str.find('input', {'type': 'email'})
            # Switch its type (email -> text)
            champ__str['type'] = 'text'
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="form-group">
                    <span class="field">
                        <div class="input-group">
                            {}
                            <span class="input-group-addon">
                                <span class="fa fa-at"></span>
                            </span>
                        </div>
                    </span>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, champ__str)
        elif type_champ == 'NumberInput':
            # Grab the <input/> of type number
            champ__str = champ__str.find('input', {'type': 'number'})
            # Switch its type (number -> text)
            champ__str['type'] = 'text'
            # Drop unwanted attributes
            for ta in ['min']:
                if champ__str.has_attr(ta):
                    del champ__str[ta]
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'PasswordInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'RadioSelect':
            # Work out which kind of RadioSelect this is
            dtable = True
            for i in champ__str.find_all('input'):
                if not i.has_attr('into-datatable'):
                    dtable = False
            # Pick the template accordingly
            if not dtable:
                gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
            else:
                # Keep the labels
                labels = champ.label.split('|')
                # Build the <tr/> rows of the <tbody/>
                trs = []
                for li in champ__str.find_all('li'):
                    # The label element holds the data of one <tr/>
                    label = li.find('label')
                    i = label.find('input')
                    # Drop the (now useless) into-datatable attribute
                    del i['into-datatable']
                    # Stack up the <tr/> rows
                    if i['value']:
                        trs.append('<tr>{}</tr>'.format(''.join([
                            '<td>{}</td>'.format(elem if elem != '__rb__' else i)
                            for elem in label.text.split('|')
                        ])))
                gabarit = '''
                <div class="field-wrapper" id="fw_{}">
                    <span class="field-label">{}</span>
                    <div class="custom-table" id="dtable_{}">
                        <table border="1" bordercolor="#DDD">
                            <thead><tr>{}</tr></thead>
                            <tbody>{}</tbody>
                        </table>
                    </div>
                    <span class="field-error-message"></span>
                </div>
                '''.format(
                    attr_name, labels[0], attr_name,
                    ''.join(['<th>{}</th>'.format(elem if elem != '__rb__' else '')
                             for elem in labels[1:]]),
                    ''.join(trs))
        elif type_champ == 'Select':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'SelectMultiple':
            # Keep the labels
            labels = champ.label.split('|')
            # Build the <tr/> rows of the <tbody/>
            trs = []
            for option in champ__str.find_all('option'):
                tds = []
                for index, elem in enumerate(option.text.split('|')):
                    td_content = elem
                    if elem == '__zcc__':
                        kwargs = {
                            'id': 'id_{}_{}'.format(attr_name, index),
                            'name': attr_name,
                            'type': 'checkbox',
                            'value': option['value'],
                        }
                        if option.has_attr('selected'):
                            kwargs['checked'] = True
                        td_content = '<input {}>'.format(' '.join([
                            '{}="{}"'.format(cle, val) for cle, val in kwargs.items()
                        ]))
                    tds.append('<td>{}</td>'.format(td_content))
                trs.append('<tr>{}</tr>'.format(''.join(tds)))
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="custom-table" id="dtable_{}">
                    <table border="1" bordercolor="#DDD">
                        <thead><tr>{}</tr></thead>
                        <tbody>{}</tbody>
                    </table>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(
                attr_name, labels[0], attr_name,
                ''.join(['<th>{}</th>'.format(
                    elem if elem != '__zcc__'
                    else '<input type="checkbox" id="id_{}__all" value="__ALL__">'.format(attr_name)
                ) for elem in labels[1:]]),
                ''.join(trs))
        elif type_champ == 'Textarea':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'TextInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'TimeInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        else:
            gabarit = None

        # Stack the field into the output dict, unless no template matched
        if gabarit:
            output[champ.name] = safe(gabarit)
        else:
            raise ValueError('No template is available for a {} field.'.format(type_champ))

    return output