def main(argv):
    args = parse_args(argv[1:])

    # Pull the substitution-model components out of the model library.
    with args.tdir.joinpath("model.xml").open() as file:
        model_id = args.model.split("+", maxsplit=1)[0]
        model = BeautifulSoup(file, "xml").find("model", id=model_id)
        sub_model = model.select_one("subModel")
        site_model = model.select_one("#sitemodel")
        operators = model.select_one("operators")
        prior = model.select_one("prior")
        log = model.select_one("log")

    with args.tdir.joinpath(f"{args.clock}-{args.coalescent}.xml").open() as file:
        soup = BeautifulSoup(file, "xml")

    # taxa
    tag_tax, tag_aln = taxa_tags(soup, args.msa, args.dregex, args.dformat)
    soup.beast.insert(0, tag_tax)
    soup.beast.insert(1, tag_aln)

    # model
    soup.beast.insert(2, sub_model)
    soup.beast.insert(3, site_model)
    if model.has_attr("operators"):
        for ele in list(operators.children):
            soup.beast.operators.append(ele)
    if model.has_attr("prior"):
        for ele in list(prior.children):
            soup.beast.mcmc.joint.prior.append(ele)
    if model.has_attr("log"):
        for ele in list(log.children):
            soup.select_one("#fileLog").append(ele)
    if "+G" in args.model:
        gammaize(soup)
    if "+I" in args.model:
        invariantize(soup)

    # MCMC
    soup.select_one("mcmc")["chainLength"] = args.len_mcmc
    soup.select_one("mcmc")["operatorAnalysis"] = args.stem + ".ops"
    soup.select_one("#fileLog")["logEvery"] = args.echo_mcmc
    soup.select_one("#fileLog")["fileName"] = args.stem + ".log"
    soup.select_one("logTree")["logEvery"] = args.echo_mcmc
    soup.select_one("logTree")["fileName"] = args.stem + ".trees"
    soup.select_one("#screenLog")["logEvery"] = args.echo

    # PS/SS
    psss_tags(soup, args.tdir.joinpath("psss.xml"), **vars(args))

    print(soup.prettify())
    return 0
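# The BEAST templates above are addressed with CSS id selectors ("#fileLog",
# "#sitemodel"). A minimal runnable sketch, assuming bs4 with lxml and
# soupsieve installed, of how select_one("#...") resolves against an XML tree:
from bs4 import BeautifulSoup

xml = '<beast><log id="fileLog" logEvery="1000"/></beast>'
soup = BeautifulSoup(xml, "xml")
tag = soup.select_one("#fileLog")  # id selectors match the id attribute in XML too
tag["fileName"] = "run1.log"       # attribute assignment mutates the tree in place
print(soup.prettify())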
# Requires: import requests; from requests.exceptions import Timeout;
# from bs4 import BeautifulSoup
def download(self, url, rc=5, data=None, proxies=None):
    """Fetch a page, retrying up to `rc` times; return a soup or None."""
    try:
        rs = requests.request('GET', url, headers=self.headers, data=data)
        content = rs.content
        soup = BeautifulSoup(content, 'lxml')
        # NOTE: the document root never carries attributes, so this
        # has_attr('href') check is always False as written.
        if soup.has_attr('href'):
            if "该页未找到" in soup.title.string:  # "page not found" marker in the title
                if rc > 0:
                    print("Page not found, retrying download:", url, rc)
                    soup = self.download(url, rc - 1)
                else:
                    print("Download failed: no retries left")
                    soup = None
    except Timeout as e:
        print('Downloader download ConnectionError or Timeout: ' + str(e))
        soup = None
        if rc > 0:
            print("Timed out, retrying download:", url, rc)
            soup = self.download(url, rc - 1)
    except Exception as e:
        print('Downloader download Exception: ' + str(e))
        soup = None
        if rc > 0:
            print("Error, retrying download:", url, rc)
            soup = self.download(url, rc - 1)
    return soup
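# A runnable illustration of why the soup.has_attr('href') check above can
# never fire: has_attr answers for the tag it is called on, and the
# BeautifulSoup document root carries no attributes of its own.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/x">link</a>', 'html.parser')
print(soup.has_attr('href'))    # False: the document root has no attributes
print(soup.a.has_attr('href'))  # True: the <a> Tag actually carries href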
def make_tag(tag0, klass, data, ndp=None, template=None, poset=None):
    # NOTE: realpath, generate_pdf, crop_pdf and create_a_to_data come from
    # the enclosing module scope.
    svg = data['svg']
    tag_svg = BeautifulSoup(svg, 'lxml', from_encoding='utf-8').svg
    assert tag_svg.name == 'svg'

    # Rescale the SVG if it declares explicit pt dimensions.
    if tag_svg.has_attr('width'):
        ws = tag_svg['width']
        hs = tag_svg['height']
        assert 'pt' in ws
        w = float(ws.replace('pt', ''))
        h = float(hs.replace('pt', ''))
        scale = MCDPConstants.scale_svg
        w2 = w * scale
        h2 = h * scale
        tag_svg['width'] = w2
        tag_svg['height'] = h2
        tag_svg['rescaled'] = 'Rescaled from %s %s, scale = %s' % (ws, hs, scale)
    else:
        print('no width in SVG tag: %s' % tag_svg)

    tag_svg['class'] = klass

    # Carry the style/id of the source tag over to the SVG.
    if tag0.has_attr('style'):
        tag_svg['style'] = tag0['style']
    if tag0.has_attr('id'):
        tag_svg['id'] = tag0['id']

    if generate_pdf:
        pdf0 = data['pdf']
        pdf = crop_pdf(pdf0, margins=0)
        div = Tag(name='div')
        att = MCDPConstants.ATTR_LOAD_NAME
        # Choose a basename for the downloadable PDF.
        if tag0.has_attr('id'):
            basename = tag0['id']
        elif ndp is not None and hasattr(ndp, att):
            basename = getattr(ndp, att)
        elif template is not None and hasattr(template, att):
            basename = getattr(template, att)
        elif poset is not None and hasattr(poset, att):
            basename = getattr(poset, att)
        else:
            # encode() needed on Python 3, where hashlib requires bytes
            hashcode = hashlib.sha224(tag0.string.encode('utf-8')).hexdigest()[-8:]
            basename = 'code-%s' % hashcode
        docname = os.path.splitext(os.path.basename(realpath))[0]
        download = docname + "." + basename + "." + klass + '.pdf'
        a = create_a_to_data(download=download, data_format='pdf', data=pdf)
        a['class'] = 'pdf_data'
        a.append(NavigableString(download))
        div.append(tag_svg)
        div.append(a)
        return div
    else:
        return tag_svg
def save_contact(self, name, phone) -> bool:
    """
    :param name: display name for the new contact
    :type name: str
    :param phone: phone number for the new contact
    :type phone: str
    :return: whether the contact exists after the save attempt
    :rtype: bool
    """
    from bs4 import BeautifulSoup

    command = self._build_am_start(
        {
            IntentFlags.ACTION: AndroidActionInsert,
            IntentFlags.MIME_TYPE: AndroidVndContact,
            IntentFlags.EXTRA_STRING_VALUE: [
                {"value": "name", "extra": f"\"{name}\""},
                {"value": "phone", "extra": f"{phone}"},
            ],
        },
        app_distinct=True,
        app_name="com.android.contacts")
    self.device.adb_utils.shell(command)

    # The dump is done to try to get the contact sync notification, and thus
    # be able to click to not sync.
    xml = self.device.adb_utils.dump_hierarchy()
    soup = BeautifulSoup(xml, "xml")  # explicit parser: uiautomator dumps are XML
    for node in soup.find_all("node", {"resource-id": "com.android.contacts:id/text"}):
        if node.has_attr("text") and "contacts online" in node["text"]:
            # Some devices may already start with the "do not sync" button in
            # focus, so we hit enter.
            self.device.adb_device.input_keyevent(AndroidKeyEvents.ENTER)
            self.device.adb_device.input_keyevent(AndroidKeyEvents.TAB)
            self.device.adb_device.input_keyevent(AndroidKeyEvents.ENTER)

    # The HOME button is capable of saving the contact; it is worth noting
    # that this is not guaranteed to work correctly.
    # TODO: search for a way to ensure that the contact has been saved or not.
    self.device.adb_device.input_keyevent(AndroidKeyEvents.HOME)

    # It is necessary to finish the process; otherwise, when we return,
    # Android will try to save the previous contact again.
    self.device.adb_utils.app_stop("com.android.contacts")
    return self.contact_exists(phone)
def get_href(cls, element: BeautifulSoup) -> Optional[str]:
    # The element itself may carry the href ...
    if element.has_attr("href"):
        return element["href"]
    # ... otherwise look for an <a>: either the element is one,
    # or it contains one.
    if element.name == "a":
        a = element
    elif cls.exists(element.a):
        a = element.a
    else:
        return None
    return a.get("href")
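# A minimal usage sketch of the fallback order in get_href: the element's own
# href wins, then a nested <a>. `Helper` is a hypothetical stand-in for the
# class this method lives on; its exists() is assumed to be a truthiness check.
from typing import Optional
from bs4 import BeautifulSoup

class Helper:
    @staticmethod
    def exists(tag) -> bool:
        return tag is not None

    @classmethod
    def get_href(cls, element) -> Optional[str]:
        if element.has_attr("href"):
            return element["href"]
        if element.name == "a":
            a = element
        elif cls.exists(element.a):
            a = element.a
        else:
            return None
        return a.get("href")

doc = BeautifulSoup('<li><a href="/item/1">one</a></li>', "html.parser")
print(Helper.get_href(doc.li))  # /item/1  (via the nested <a>)
print(Helper.get_href(doc.a))   # /item/1  (the element itself is the <a>)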
def scrape_data_from_dom(dom: BeautifulSoup) -> dict:
    fighters = {}
    props = {}
    fighter_ids = []

    # CSS class names that define whether the <tr> contains data for a
    # fighter or for a prop.
    css_classes_fighter = [None]
    css_classes_prop = ['pr', 'pr-odd']

    # Loop through the fights, one odds table per fight.
    fight_tables = dom.find_all("table", class_="odds-table")
    for fight_table in fight_tables:
        # Extract the header with the names of the sports books.
        sports_books_names = Scraper.extract_sports_book_names_from_dom(fight_table)

        # Loop through each row in the fight table.
        for row in fight_table.find("tbody").find_all("tr"):
            # Make sure this row has <td> elements and isn't just the
            # "odds-table-responsive-header".
            if len(row.find_all("td")) == 0:
                continue
            sports_books = Scraper.extract_sports_book_values(row, sports_books_names)

            if not row.has_attr('class'):
                # The <tr>s without a class are the fighter rows.
                f = Fighter()
                f.load_dom(row, sports_books)
                fighters[f.id] = f  # add the Fighter instance to the dict being returned
                fighter_ids.append(f.id)
            elif [i for i in row['class'] if i in css_classes_prop]:
                # Any prop class on the row marks a prop row.
                # Context note: the website layout has two rows with the fighter
                # data, followed by all the props. This is the reason fighters
                # and fighter_ids are expected to be populated at this point.
                p = Prop()
                p.load_dom(row, sports_books, fighters[fighter_ids[-1]],
                           fighters[fighter_ids[-2]])
                props[p.id] = p  # add the Prop instance to the dict being returned

    return {'fighters': fighters, 'props': props}
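# bs4 treats class as a multi-valued attribute, which is why the prop check
# above iterates row['class'] as a list; a runnable illustration:
from bs4 import BeautifulSoup

row = BeautifulSoup('<tr class="pr pr-odd"><td>o/u</td></tr>', 'html.parser').tr
print(row.has_attr('class'))  # True
print(row['class'])           # ['pr', 'pr-odd'] -- one list entry per class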
def find_syns(self, word):
    if word in self.synTable:
        return self.synTable[word]
    word = word.lower()
    html_doc = requests.get(self.base_uri + word).content
    t = BeautifulSoup(html_doc, 'html.parser')
    synonyms = []
    # Look for the section headed "Синонимы" ("Synonyms").
    for el in t.find_all("span", {'class': 'mw-headline'}):
        if u"Синонимы" in el.text:
            neededTags = el.find_parent().find_next_sibling().find_all("a")
            synonyms = [a.text.replace(u"ё", u"е")
                        for a in neededTags if not a.has_attr("class")]
    # "править" ("edit") in the list means we grabbed an edit link, not synonyms.
    if u"править" not in synonyms:
        answer = synonyms
    else:
        answer = []
    self.synTable[word] = answer
    return answer
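# The headline-then-sibling hop above is the usual way to grab the list that
# follows a MediaWiki section heading; a runnable miniature (markup invented):
from bs4 import BeautifulSoup

html = ('<h3><span class="mw-headline">Синонимы</span></h3>'
        '<ul><li><a href="/w1">быстрый</a></li></ul>')
soup = BeautifulSoup(html, 'html.parser')
headline = soup.find('span', {'class': 'mw-headline'})
links = headline.find_parent().find_next_sibling().find_all('a')
print([a.text for a in links])  # ['быстрый']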
def populate_staff_users(self, forum_content):
    user_json = {}
    user_section = BeautifulSoup(forum_content, "lxml").find(
        "section", attrs={"id": "discussion-container"})
    if user_section and user_section.has_attr("data-roles"):
        if "&quot;" in user_section["data-roles"]:
            # The JSON is HTML-escaped; unescape it before parsing.
            user_json = json.loads(html.unescape(user_section["data-roles"]))
        else:
            user_json = json.loads(user_section["data-roles"])
    else:
        user_section = re.search("roles: [^\n]*", forum_content)
        if user_section:
            # TODO: check that parsing is OK in this case
            user_json = json.loads(
                re.sub(r"roles: (.*),", r"\1", user_section.group()))
    for user in user_json:
        self.staff_user += [str(y) for y in user_json[user]]
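# A tiny runnable demo of the escaped-JSON path that the &quot; check above
# guards for:
import html
import json

raw = '{&quot;Staff&quot;: [4, 7]}'
print(json.loads(html.unescape(raw)))  # {'Staff': [4, 7]}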
def loadMp(self, loadfile, id, rc=5):
    print("Cache worker started, id: %d" % id)
    self.id = id
    while True:
        url = loadfile.get(True)
        if url == "kill":
            print("Worker", id, "download loop finished")
            return 0
        try:
            print("Caching page:", url, "queue length:", loadfile.qsize())
            s = time.time()
            rs = requests.request('GET', url, headers=self.headers)
            content = rs.content
            soup = BeautifulSoup(content, 'lxml')
            # NOTE: always False on the document root; see download() above.
            if soup.has_attr('href'):
                if "该页未找到" in soup.title.string:  # "page not found" marker
                    if rc > 0:
                        print("Page not found, retrying download:", url, rc)
                        soup = self.download(url, rc - 1)
                    else:
                        print("Download failed: no retries left")
                        soup = None
        except Timeout as e:
            print('Downloader download ConnectionError or Timeout: ' + str(e))
            soup = None
            if rc > 0:
                print("Timed out, retrying download:", url, rc)
                soup = self.download(url, rc - 1)
        except Exception as e:
            print('Downloader download Exception: ' + str(e))
            soup = None
            if rc > 0:
                print("Error, retrying download:", url, rc)
                soup = self.download(url, rc - 1)
        if soup:
            data = self.fr(self, soup)
            if data:
                self.datas.put(data)
        e = time.time()
        print("Finished caching page:", url, "took %d s" % (e - s))
def insert_into_db(self, parsedResults, query, resultnum):
    num = resultnum
    try:
        for r in parsedResults:
            soup = BeautifulSoup(r, 'lxml', from_encoding='utf8').find('div', class_='rb')
            if soup is not None and soup.has_attr('id'):
                soup['id'] = 'rb_' + str(num)
                robj = SearchResult.objects.create(query=query, rank=num,
                                                   result_id='rb_' + str(num),
                                                   content=str(soup))
                robj.save()
                num += 1
            else:
                print("THE RESULT IS NOT VALID", resultnum)
    except Exception as e:
        print("roll back!")
        print(e)
        transaction.rollback()
        return resultnum
    else:
        print("commit success!")
        # NOTE: calling transaction.atomic() here is a no-op; the call that
        # would match the log message is transaction.commit().
        transaction.atomic()
        return num
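# The try/except/else shape above is what keeps the commit path honest: the
# else suite runs only when the try body raised nothing. A minimal reminder:
def parse_or_report(text):
    try:
        value = int(text)
    except ValueError as exc:
        print('roll back!', exc)
    else:
        print('commit success!')  # reached only when int() did not raise
        return value

parse_or_report('42')    # commit success!
parse_or_report('nope')  # roll back! ...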
def process(data_file):
    # folders that are exported to
    #file_targets = []  # files that are intended to be exported to said folders
    data_str = open(data_file, 'r')
    #data = BeautifulSoup(data_str, "html5lib").contents[0].contents[1].contents[0]
    data = BeautifulSoup(data_str, "xml").contents[0]  # this respects self-closing tags
    data_str.close()

    if data.has_attr('debug'):
        debug = data['debug'] == 'True'
        print("debug:" + str(debug))
    raw_folder = None
    target_folder = None
    if data.has_attr('raw'):
        raw_folder = data['raw']
    if data.has_attr('target'):
        target_folder = data['target']
    else:
        raise ValueError('No target specified')
    if data.has_attr('watermark'):
        settings.watermark = data['watermark']
    print("raw:", raw_folder)
    print("target:", target_folder)
    print("watermark:", settings.watermark)

    # One bookkeeping entry per export category: output root, created
    # directories, and emitted files.
    target_folders = {}
    for key in ('galleries', 'thumbnails', 'images', 'gifs', 'video', 'music'):
        target_folders[key] = {
            'root': os.path.join(target_folder, key),
            'dir': set(),
            'files': [],
        }

    # process galleries
    galleries_tags = data.find_all('galleries')
    for galleries in galleries_tags:
        for gallery in galleries:
            if gallery.name != 'gallery':
                continue
            images = []
            try:
                name = gallery['name']
            except KeyError:
                raise ValueError('No name specified for gallery')
            print(name)
            # get images listed explicitly
            for i in gallery:
                if i.name != 'image':
                    continue
                try:
                    src = i['src']
                    f = os.path.join(raw_folder, src)
                    if image.is_image(f):
                        images.append(f)
                except KeyError:
                    src = None
            # get images for a whole folder
            try:
                folder = os.path.join(raw_folder, gallery['folder'])
                for f in os.listdir(folder):
                    if image.is_image(f):
                        images.append(os.path.join(folder, f))
            except Exception:
                folder = None
            # process gallery
            gallery_target = os.path.join(target_folder, 'galleries', name)
            target_folders['galleries']['dir'].add(gallery_target)
            if not os.path.exists(gallery_target):
                os.makedirs(gallery_target)
            thumbnail_target = os.path.join(target_folder, 'thumbnails', name)
            target_folders['thumbnails']['dir'].add(thumbnail_target)
            if not os.path.exists(thumbnail_target):
                os.makedirs(thumbnail_target)
            cnt = 1
            for i in images:
                print(' ', i)
                basename = os.path.basename(i)
                filename, ext = os.path.splitext(basename)
                # process web-sized copy
                out_file = os.path.join(gallery_target, name + '-' + str(cnt) + '.jpg')
                target_folders['galleries']['files'].append(out_file)
                image.process(i, out_file, ext='jpg', max=(800, 600), quality=95,
                              watermark=settings.watermark)
                # process thumbnail
                out_file = os.path.join(thumbnail_target,
                                        name + '-' + str(cnt) + '-thumbnail.jpg')
                target_folders['thumbnails']['files'].append(out_file)
                image.process(i, out_file, ext='jpg', max=(105, 140), quality=95)
                cnt += 1

    # process images
    images_tags = data.find_all('images')
    for images in images_tags:
        for tag in images:
            if tag.name != 'file':
                continue
            scale = 1.0
            try:
                scale = float(tag['scale'])
            except Exception:
                pass
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for image')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for image')
            # per-tag watermark override
            cur_watermark = None
            try:
                if tag['use_watermark'].lower() == 'true':
                    cur_watermark = settings.watermark
                elif tag['use_watermark'].lower() == 'false':
                    cur_watermark = None
            except KeyError:
                pass
            image_rel, filename = os.path.split(target)
            # process folder
            image_folder = os.path.join(target_folder, 'images', image_rel)
            target_folders['images']['dir'].add(image_folder)
            if not os.path.exists(image_folder):
                os.makedirs(image_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(image_folder, filename + '.jpg')
            target_folders['images']['files'].append(out_file)
            image.process(in_file, out_file, ext='jpg', quality=95, scale=scale,
                          watermark=cur_watermark)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    # process videos
    videos_tags = data.find_all('videos')
    for videos in videos_tags:
        # per-group watermark override
        vid_watermark = settings.watermark
        try:
            if videos['use_watermark'].lower() == 'false':
                vid_watermark = None
        except KeyError:
            pass
        for tag in videos:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for video')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for video')
            # per-tag watermark override
            cur_watermark = vid_watermark
            try:
                if tag['use_watermark'].lower() == 'true':
                    cur_watermark = settings.watermark
                elif tag['use_watermark'].lower() == 'false':
                    cur_watermark = None
            except KeyError:
                pass
            video_rel, filename = os.path.split(target)
            # process folder
            video_folder = os.path.join(target_folder, 'video', video_rel)
            target_folders['video']['dir'].add(video_folder)
            if not os.path.exists(video_folder):
                os.makedirs(video_folder)
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(video_folder, filename + '.mp4')
            out_image = os.path.join(video_folder, filename + '.jpg')
            target_folders['video']['files'].append(out_file)
            target_folders['video']['files'].append(out_image)
            video.process(in_file, out_file, out_image, watermark=cur_watermark)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)
            print(' ' + out_image)

    # process gifs
    gifs_tags = data.find_all('gifs')
    for gifs in gifs_tags:
        for tag in gifs:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for gif')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for gif')
            # optional attributes default to None (Tag.get mirrors dict.get)
            scale = tag.get('scale')
            crop_x = tag.get('crop_x')
            crop_y = tag.get('crop_y')
            crop_w = tag.get('crop_w')
            crop_h = tag.get('crop_h')
            start_time = tag.get('start_time')
            end_time = tag.get('end_time')
            gif_rel, filename = os.path.split(target)
            # process folder
            gif_folder = os.path.join(target_folder, 'gifs', gif_rel)
            target_folders['gifs']['dir'].add(gif_folder)
            if not os.path.exists(gif_folder):
                os.makedirs(gif_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(gif_folder, filename + '.gif')
            target_folders['gifs']['files'].append(out_file)
            gif.process(in_file, out_file, scale=scale, crop_x=crop_x,
                        crop_y=crop_y, crop_w=crop_w, crop_h=crop_h,
                        start_time=start_time, end_time=end_time)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    # process music
    musics_tags = data.find_all('music')
    for music in musics_tags:
        for tag in music:
            if tag.name != 'file':
                continue
            try:
                src = tag['src']
            except KeyError:
                raise ValueError('No src specified for music')
            try:
                target = tag['target']
            except KeyError:
                raise ValueError('No target specified for music')
            music_rel, filename = os.path.split(target)
            # process folder
            music_folder = os.path.join(target_folder, 'music', music_rel)
            target_folders['music']['dir'].add(music_folder)
            if not os.path.exists(music_folder):
                os.makedirs(music_folder)
            # process web
            in_file = os.path.join(raw_folder, src)
            out_file = os.path.join(music_folder, filename + '.mp3')
            target_folders['music']['files'].append(out_file)
            audio.process(in_file, out_file)
            print(filename)
            print(' ' + in_file)
            print(' ' + out_file)

    if settings.delete:
        utilities.purify(target_folders)
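# For orientation, a hypothetical input file in the shape process() expects:
# one root element carrying raw/target/watermark attributes, with <galleries>,
# <images>, <videos>, <gifs> and <music> groups beneath it. Element and
# attribute names are taken from the parsing code above; the file contents
# themselves are invented, and the root tag name is arbitrary (the code only
# reads contents[0]):
#
#   <site raw="raw" target="export" watermark="wm.png" debug="True">
#     <galleries>
#       <gallery name="trip" folder="trip-photos">
#         <image src="extra/cover.jpg"/>
#       </gallery>
#     </galleries>
#     <images>
#       <file src="logo.png" target="branding/logo" scale="0.5" use_watermark="false"/>
#     </images>
#     <videos use_watermark="false">
#       <file src="clip.mov" target="clips/clip"/>
#     </videos>
#   </site>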
def start():
    with open(ENTRY_POINT_FILENAME, 'r') as entry_point_file:
        table = BeautifulSoup(entry_point_file, 'lxml')
    links = table.find_all('a')
    for link in links:
        href = link['href'] + '/by_Degree'
        college_page_link = BASE_URL + href
        college_page_html = urlopen(college_page_link).read()
        college_page_soup = BeautifulSoup(college_page_html, 'lxml')
        college = href.split('/')[3].replace('School=', '')
        for table in college_page_soup.find_all('table'):
            # Only the salary tables carry all three of these classes.
            if not (table.has_attr('class') and 'tlf' in table['class']
                    and 'f11' in table['class'] and 'w585' in table['class']):
                continue
            for row in table.find_all('tr'):
                # Skip header rows and rows missing either cell.
                if row.has_attr('class') or not row.th or not row.td:
                    continue
                try:
                    degree = row.th.a.string
                    salary = row.td.string
                except AttributeError:
                    continue
                # Salary may be a range '$x - $y', in which case take the
                # average; also strip the '$' and thousands separators.
                tokens = salary.split()
                if len(tokens) == 3:
                    low = int(tokens[0][1:].replace(',', ''))
                    high = int(tokens[2][1:].replace(',', ''))
                    salary = (low + high) / 2
                else:
                    salary = int(tokens[0][1:].replace(',', ''))
                salary = int(salary)
                # make db insertions
                degree_id = -1
                college_id = -1
                try:
                    degree = Degree(degree)
                    db.session.add(degree)
                    db.session.commit()
                    degree_id = degree.id
                except Exception:
                    # Likely a duplicate: roll back and reuse the existing row.
                    db.session.rollback()
                    degree = Degree.query.filter_by(name=degree.name).first()
                    if not degree:
                        continue
                    degree_id = degree.id
                college_id = find_college_id(college)
                if college_id == -1:
                    continue
                try:
                    college_degree_salary = CollegeDegreeSalary(degree_id, college_id, salary)
                    db.session.add(college_degree_salary)
                    db.session.commit()
                except Exception:
                    continue
def scrapeGame(game):
    currentRow = ""
    game = Soup(game, 'lxml')
    if game.has_attr("class"):
        return ""
    try:
        week = game.find('th').text
        score = game.find('td', {'data-stat': 'boxscore_word'}).find('a')
        if score is None:
            return ""
        scoreUrl = score.get('href')
        cbsUrl = 'https://www.pro-football-reference.com' + scoreUrl
        req = requests.get(cbsUrl)
        soup = Soup(req.text, 'lxml')
        scorebox = soup.find('div', {'class': 'scorebox'})
        home, away = [x.text for x in scorebox.find_all('a', {'itemprop': 'name'})]

        def starters_row(div_id, table_id, team):
            """Build one CSV row of starters for a team.

            The starters tables are embedded inside HTML comments, so each
            comment is re-parsed before extracting its rows. Identical logic
            was originally duplicated for the home and visiting teams.
            """
            row = str(week) + "," + team + ","
            div = soup.find('div', {'id': div_id})
            comments = div.find_all(text=lambda text: isinstance(text, Comment))
            positionMap = {'QB': " ", 'LT': " ", 'LG': " ", 'C': " ",
                           'RG': " ", 'RT': " "}
            found = False
            rbs, wrs, tes, lbs, des, dts, cbs, safeties = [], [], [], [], [], [], [], []
            for elem in comments:
                table = Soup(elem, 'lxml').find('table', {'id': table_id})
                if table is None:
                    continue
                for guy in table.find_all('tr'):
                    name = guy.find('th')
                    pos = guy.find('td')
                    if pos is None or name is None:
                        continue
                    if 'RB' in pos.text or 'HB' in pos.text or 'FB' in pos.text:
                        rbs.append(name.text)
                    elif 'LB' in pos.text or 'WILL' in pos.text or 'MIKE' in pos.text:
                        lbs.append(name.text)
                    elif 'WR' in pos.text:
                        wrs.append(name.text)
                    elif 'TE' in pos.text:
                        tes.append(name.text)
                    elif 'DE' in pos.text or 'DL' in pos.text:
                        des.append(name.text)
                    elif 'DT' in pos.text or 'NT' in pos.text:
                        dts.append(name.text)
                    elif 'CB' in pos.text or 'DB' in pos.text:
                        cbs.append(name.text)
                    elif 'S' in pos.text:
                        safeties.append(name.text)
                    else:
                        positionMap[pos.text] = name.text
                    found = True
            if found:
                row += positionMap['QB'] + ","
                if rbs:
                    row += ",".join(rbs) + ","
                if wrs:
                    row += ",".join(wrs) + ","
                if tes:
                    row += ",".join(tes) + ","
                row += (positionMap['LT'] + "," + positionMap['LG'] + "," +
                        positionMap['C'] + "," + positionMap['RG'] + "," +
                        positionMap['RT'] + ",")
                if des:
                    row += ",".join(des) + ","
                if dts:
                    row += ",".join(dts) + ","
                if lbs:
                    row += ",".join(lbs) + ","
                if cbs:
                    row += ",".join(cbs) + ","
                row += ",".join(safeties)
            row += '\n'
            return row

        currentRow += starters_row('all_home_starters', 'home_starters', home)
        currentRow += starters_row('all_vis_starters', 'vis_starters', away)
        return currentRow
    except AttributeError:
        return ""
# 1. Check all works in Expression, compare updated time against
#    the last change time of origin. (=> UPDATED if outdated
#    by timestamp, => DELETE if source work disappears)
# 2. Check all works in origin which do not exist in Expression
#    (=> NEW WORKS)
entity = session.query(Expression).filter(Expression.collection_url == cbase).all()
for row in entity:
    srcwork = session.query(WikimediaItems).filter(
        WikimediaItems.title == row.source_id).first()
    if not srcwork:
        # Source work disappeared: drop the local copy.
        session.delete(row)
        session.commit()
    elif srcwork.updated_date > row.updated_date:
        # Source is newer: refresh our record from it.
        soup = BeautifulSoup(srcwork.artist)
        artist = soup.get_text()
        # Re-parse only the <a> tags of the artist markup to recover a link.
        link = BeautifulSoup(srcwork.artist, parse_only=SoupStrainer('a'))
        if link.a is not None and link.a.has_attr('href'):
            artisturl = link.a['href']
        else:
            artisturl = None
        soup = BeautifulSoup(srcwork.imagedescription)
        desc = soup.get_text()
        license = valid_license(srcwork.license_url)
        if not license:
            log.debug("Invalid license: %s" % srcwork.license_url)
            continue
        session.query(Expression).filter(Expression.id == row.id).update({
            Expression.title: srcwork.title,
            Expression.description: desc,
            Expression.rights_statement: license,
            Expression.credit: artist,
            Expression.credit_url: artisturl,
        })
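# SoupStrainer makes the parser keep only matching tags, which is why the
# fragment above parses the artist markup twice: once whole for the text and
# once strained for the link. A runnable sketch:
from bs4 import BeautifulSoup, SoupStrainer

markup = 'by <a href="https://example.org/alice">Alice</a> (photo)'
text = BeautifulSoup(markup, 'html.parser').get_text()
links = BeautifulSoup(markup, 'html.parser', parse_only=SoupStrainer('a'))
print(text)             # by Alice (photo)
print(links.a['href'])  # https://example.org/alice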
site_model = model.select_one("#sitemodel")
operators = model.select_one("operators")
prior = model.select_one("prior")
log = model.select_one("log")

with PATH_TEMPLATES.joinpath(f"{params.clock}-{params.coal}.xml").open() as stream:
    soup = BeautifulSoup(stream, "xml")

# taxa
tag_tax, tag_aln = taxa_tags(soup, str(snakemake.input.fas))
soup.beast.insert(0, tag_tax)
soup.beast.insert(1, tag_aln)

# model
soup.beast.insert(2, sub_model)
soup.beast.insert(3, site_model)
if model.has_attr("operators"):
    for ele in list(operators.children):
        soup.beast.operators.append(ele)
if model.has_attr("prior"):
    for ele in list(prior.children):
        soup.beast.mcmc.joint.prior.append(ele)
if model.has_attr("log"):
    for ele in list(log.children):
        soup.select_one("#fileLog").append(ele)
if "+G" in model_bic:
    gammaize(soup)
if "+I" in model_bic:
    invariantize(soup)

# MCMC
soup.select_one("mcmc")["chainLength"] = params.mcmc_len
soup.select_one("mcmc")["operatorAnalysis"] = params.stem + ".ops"
def findFirst(link):
    """
    Receives a link to a Wikipedia page.
    Finds and returns a string representing the first link in the page.
    """
    # Get to the content part of the HTML where we'll find the link.
    info = BeautifulSoup(requests.get(link).text, 'html.parser')
    info = info.body.find(id="content").find(id="bodyContent")
    info = info.find(id="mw-content-text").div
    info = info.next

    # Cycle through content until we find the first link.
    while (info == '\n' or info.has_attr('class') or info.name == 'style'):
        info = info.next_sibling

    # Handle the case where the first paragraph has no link: keep advancing
    # until a paragraph with an <a> is found.
    while True:
        try:
            newLink = base + info.a['href']
            break
        except TypeError:
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling

    # Skip links to "Geographic coordinate system" (page coordinates).
    while True:
        if newLink == "https://en.wikipedia.org/wiki/Geographic_coordinate_system":
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling
            newLink = base + info.a['href']
        else:
            break

    # Skip citation links such as "[1]".
    while True:
        if info.a.text == "[1]":
            info = info.next_sibling
            while (info == '\n' or info.has_attr('class') or info.name == 'style'):
                info = info.next_sibling
        else:
            break
    newLink = base + info.a['href']

    # Strip the "https://en.wikipedia.org" prefix (24 characters).
    return newLink[24:]
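# A self-contained miniature of the skip loop used throughout findFirst, with
# an explicit Tag check instead of the '\n' comparison (NavigableStrings have
# no has_attr, so a plain string other than '\n' would crash the original
# condition); markup invented:
from bs4 import BeautifulSoup, Tag

doc = BeautifulSoup(
    "<div><p class='hatnote'>skip me</p>\n"
    "<p>First <a href='/wiki/Link'>link</a>.</p></div>",
    'html.parser')
node = doc.div.contents[0]
while not isinstance(node, Tag) or node.has_attr('class') or node.name == 'style':
    node = node.next_sibling  # skip whitespace and any tag carrying a class
print(node.a['href'])  # /wiki/Link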
def sub(_form):
    # Imports
    from bs4 import BeautifulSoup
    from django.template.defaultfilters import safe
    from smmaranim.custom_settings import ERROR_MESSAGES
    from smmaranim.custom_settings import MAY_BE_REQUIRED_FIELD
    from smmaranim.custom_settings import REQUIRED_FIELD

    output = {}

    # Default template for a wrapped field
    gabarit_defaut = '''
    <div class="field-wrapper" id="fw_{}">
        <span class="field-label">{}</span>
        <span class="field">{}</span>
        <span class="field-error-message"></span>
    </div>
    '''

    for champ in _form:
        # Override the default error messages
        for cle, val in ERROR_MESSAGES.items():
            _form.fields[champ.name].error_messages[cle] = val

        # Convert the field to HTML (i.e. a string) and parse it
        champ__str = BeautifulSoup('{}'.format(champ), 'html.parser')

        # Append a note to the label of every required field
        if champ.label:
            strs = champ.label.split('|')
            if _form.fields[champ.name].required:
                strs[0] += REQUIRED_FIELD
            else:
                for elem in champ__str.find_all():
                    if 'may-be-required' in elem.attrs.keys():
                        strs[0] += MAY_BE_REQUIRED_FIELD
            if champ.help_text:
                strs[0] += '<span class="help-icon" title="{}"></span>'.format(champ.help_text)
            champ.label = '|'.join(strs)

        # Compute the value of the name attribute
        attr_name = '{}-{}'.format(_form.prefix, champ.name) if _form.prefix else champ.name

        # Strip the required attribute (and the now-consumed marker)
        for elem in champ__str.find_all():
            if 'may-be-required' in elem.attrs.keys():
                del elem['may-be-required']
            if 'required' in elem.attrs.keys():
                del elem['required']

        # Get the field's widget type
        type_champ = champ.field.widget.__class__.__name__

        # Pick the template
        if type_champ == 'CheckboxInput':
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field">{}</span>
                <span class="field-label">{}</span>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ__str, champ.label)
        elif type_champ == 'ClearableFileInput':
            # Keep the file and checkbox inputs
            input_checkbox = champ__str.find('input', {'type': 'checkbox'})
            input_file = champ__str.find('input', {'type': 'file'})
            # Initialise the info block
            infos = ''
            for a in champ__str.find_all('a'):
                # Show the "Effacer" (clear) option when defined
                if input_checkbox:
                    delete = '''
                    <span class="delete-file">
                        {}
                        <label for="{}-clear_id">Effacer</label>
                    </span>
                    '''.format(input_checkbox, attr_name)
                else:
                    delete = ''
                infos = '''
                <div class="if-return">
                    <span class="file-infos">{}</span>
                    {}
                </div>
                '''.format(a['href'], delete)
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="if-container">
                    <span class="field">{}</span>
                    <span class="if-trigger">Parcourir</span>
                    {}
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, input_file, infos)
        elif type_champ == 'DateInput':
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="form-group">
                    <span class="field">
                        <div class="input-group">
                            {}
                            <span class="date input-group-addon" style="cursor: pointer;">
                                <input name="{}__datepicker" type="hidden">
                                <span class="glyphicon glyphicon-calendar"></span>
                            </span>
                        </div>
                    </span>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, champ__str, attr_name)
        elif type_champ == 'DateTimeInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'EmailInput':
            # Grab the <input/> of type email
            champ__str = champ__str.find('input', {'type': 'email'})
            # Switch its type (email -> text)
            champ__str['type'] = 'text'
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="form-group">
                    <span class="field">
                        <div class="input-group">
                            {}
                            <span class="input-group-addon">
                                <span class="fa fa-at"></span>
                            </span>
                        </div>
                    </span>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(attr_name, champ.label, champ__str)
        elif type_champ == 'NumberInput':
            # Grab the <input/> of type number
            champ__str = champ__str.find('input', {'type': 'number'})
            # Switch its type (number -> text)
            champ__str['type'] = 'text'
            # Drop unwanted attributes
            for ta in ['min']:
                if champ__str.has_attr(ta):
                    del champ__str[ta]
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'PasswordInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'RadioSelect':
            # Work out which kind of RadioSelect this is
            dtable = True
            for i in champ__str.find_all('input'):
                if not i.has_attr('into-datatable'):
                    dtable = False
            # Pick the template accordingly
            if not dtable:
                gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
            else:
                # Keep the labels
                labels = champ.label.split('|')
                # Build the <tr/> rows of the <tbody/>
                trs = []
                for li in champ__str.find_all('li'):
                    # The label element holds the data of one <tr/>
                    label = li.find('label')
                    i = label.find('input')
                    # Drop the (now useless) into-datatable attribute
                    del i['into-datatable']
                    # Stack up the <tr/> rows
                    if i['value']:
                        trs.append('<tr>{}</tr>'.format(''.join([
                            '<td>{}</td>'.format(elem if elem != '__rb__' else i)
                            for elem in label.text.split('|')
                        ])))
                gabarit = '''
                <div class="field-wrapper" id="fw_{}">
                    <span class="field-label">{}</span>
                    <div class="custom-table" id="dtable_{}">
                        <table border="1" bordercolor="#DDD">
                            <thead><tr>{}</tr></thead>
                            <tbody>{}</tbody>
                        </table>
                    </div>
                    <span class="field-error-message"></span>
                </div>
                '''.format(
                    attr_name, labels[0], attr_name,
                    ''.join(['<th>{}</th>'.format(elem if elem != '__rb__' else '')
                             for elem in labels[1:]]),
                    ''.join(trs))
        elif type_champ == 'Select':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'SelectMultiple':
            # Keep the labels
            labels = champ.label.split('|')
            # Build the <tr/> rows of the <tbody/>
            trs = []
            for option in champ__str.find_all('option'):
                tds = []
                for index, elem in enumerate(option.text.split('|')):
                    td_content = elem
                    if elem == '__zcc__':
                        kwargs = {
                            'id': 'id_{}_{}'.format(attr_name, index),
                            'name': attr_name,
                            'type': 'checkbox',
                            'value': option['value'],
                        }
                        if option.has_attr('selected'):
                            kwargs['checked'] = True
                        td_content = '<input {}>'.format(' '.join([
                            '{}="{}"'.format(cle, val) for cle, val in kwargs.items()
                        ]))
                    tds.append('<td>{}</td>'.format(td_content))
                trs.append('<tr>{}</tr>'.format(''.join(tds)))
            gabarit = '''
            <div class="field-wrapper" id="fw_{}">
                <span class="field-label">{}</span>
                <div class="custom-table" id="dtable_{}">
                    <table border="1" bordercolor="#DDD">
                        <thead><tr>{}</tr></thead>
                        <tbody>{}</tbody>
                    </table>
                </div>
                <span class="field-error-message"></span>
            </div>
            '''.format(
                attr_name, labels[0], attr_name,
                ''.join(['<th>{}</th>'.format(
                    elem if elem != '__zcc__'
                    else '<input type="checkbox" id="id_{}__all" value="__ALL__">'.format(attr_name)
                ) for elem in labels[1:]]),
                ''.join(trs))
        elif type_champ == 'Textarea':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'TextInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        elif type_champ == 'TimeInput':
            gabarit = gabarit_defaut.format(attr_name, champ.label, champ__str)
        else:
            gabarit = None

        # Stack the field into the output dict, unless no template matched
        if gabarit:
            output[champ.name] = safe(gabarit)
        else:
            raise ValueError('No template is available for a {} field.'.format(type_champ))

    return output