def getVideoData(self, videoId):
    requestUrl = "http://vimeo.com/moogaloop/load/clip:%s/local" % (videoId)
    req = urllib2.Request(requestUrl)
    req.add_header('User-Agent', "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8")
    print "vimeo api request:" + req.get_full_url()
    con = urllib2.urlopen(req)
    response = con.read()
    video = BeautifulSoup(response)
    videoCaption = unescape(video.findChild("video").findChild("caption").getText())
    videoThumbnailUrl = video.findChild("video").findChild("thumbnail").getText()
    videoRequestSignature = video.findChild("request_signature").getText()
    videoRequestSignatureExpires = video.findChild("request_signature_expires").getText()
    videoStreamUrl = self.getStreamPath(videoId, videoRequestSignature, videoRequestSignatureExpires)
    con.close()
    return {'caption': videoCaption, 'thumbnail': videoThumbnailUrl, 'streamUrl': videoStreamUrl}
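# For orientation, a rough sketch of the moogaloop-style XML this method expects and how
# findChild/getText pull values out of it. The snippet below is made up for illustration;
# the real response carries more fields (e.g. the stream info used by getStreamPath).
from BeautifulSoup import BeautifulSoup

sample_response = """<xml>
  <video>
    <caption>My clip</caption>
    <thumbnail>http://example.com/thumb.jpg</thumbnail>
  </video>
  <request_signature>abc123</request_signature>
  <request_signature_expires>1300000000</request_signature_expires>
</xml>"""

video = BeautifulSoup(sample_response)
print video.findChild("video").findChild("caption").getText()   # My clip
print video.findChild("request_signature").getText()            # abc123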
Example #2
    def test_ajax_get(self):
        """Getting page via ajax returns just itemlist."""
        res = self.get(ajax=True, status=200)

        soup = BeautifulSoup(res.json["html"])

        # outermost element is class "itemlist"
        self.assertIn("itemlist", soup.findChild()["class"])
Example #4
 def _scrape_sig_es_hps(self, alert):
     """
     Each alert is associated with two values in hidden inputs named "es"
     and "hps" which must be scraped and passed along when modifying it
     along with the "x" hidden input value to prevent xss attacks.
     """
     url = 'http://www.google.com/alerts/edit?hl=en&gl=us&s=%s' % alert._s
     response = self.opener.open(url)
     resp_code = response.getcode()
     body = response.read()
     if resp_code != 200:
         raise UnexpectedResponseError(
             resp_code,
             response.info().headers,
             body,
         )
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'x'})['value']
     es = soup.findChild('input', attrs={'name': 'es'})['value']
     hps = soup.findChild('input', attrs={'name': 'hps'})['value']
     return tuple(str(i) for i in (sig, es, hps))
Example #5
 def _scrape_sig_es_hps(self, alert):
     """
     Each alert is associated with two values in hidden inputs named "es"
     and "hps" which must be scraped and passed along when modifying it
     along with the "x" hidden input value to prevent xss attacks.
     """
     url = 'http://www.google.com/alerts/edit?hl=en&gl=us&s=%s' % alert._s
     response = self.opener.open(url)
     resp_code = response.getcode()
     body = response.read()
     if resp_code != 200:
         raise UnexpectedResponseError(
             resp_code,
             response.info().headers,
             body,
             )
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'x'})['value']
     es = soup.findChild('input', attrs={'name': 'es'})['value']
     hps = soup.findChild('input', attrs={'name': 'hps'})['value']
     return tuple(str(i) for i in (sig, es, hps))
Example #6
    def _scrape_sig_es_hps(self, alert):
        """
        Each alert is associated with two values in hidden inputs named "es"
        and "hps" which must be scraped and passed along when modifying it
        along with the "sig" hidden input value to prevent xss attacks.
        """
        es_hps_page_url = 'http://www.google.com/alerts/edit?hl=en&gl=us&s=%s' % alert._s 
        es_hps_response = self.opener.open(es_hps_page_url)

        body = es_hps_response.read()

        if es_hps_response.code != 200:
            raise UnexpectedResponseError(es_hps_response.code, es_hps_response.headers, body)
        
        soup = BeautifulSoup(body)
        sig = soup.findChild('input', attrs={'name': 'sig'})['value']
        sig = str(sig)
        es = soup.findChild('input', attrs={'name': 'es'})['value']
        es = str(es)
        hps = soup.findChild('input', attrs={'name': 'hps'})['value']
        hps = str(hps)
        return sig, es, hps
Example #7
    def _scrape_sig(self, path='/alerts'):
        """
        Google signs forms with a value in a hidden input named "sig" to
        prevent xss attacks, so we need to scrape this out and submit it along
        with any forms we POST.
        """
        sig_page_url = 'http://www.google.com%s?hl=en&gl=us' % path 
        sig_response = self.opener.open(sig_page_url)

        body = sig_response.read()
        if sig_response.code != 200:
            raise UnexpectedResponseError(sig_response.code, sig_response.headers, body)
        soup = BeautifulSoup(body)
        sig = soup.findChild('input', attrs={'name': 'sig'})['value']
        return str(sig)
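# The core of _scrape_sig is just findChild on a hidden <input>; as a standalone sketch
# (the form markup below is invented for illustration):
from BeautifulSoup import BeautifulSoup

form_html = '<form action="/alerts/create"><input type="hidden" name="sig" value="deadbeef"></form>'
form_soup = BeautifulSoup(form_html)
print str(form_soup.findChild('input', attrs={'name': 'sig'})['value'])   # deadbeef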
Example #8
 def _scrape_sig_es_hps(self, alert):
     """
     Each alert is associated with two values in hidden inputs named "es"
     and "hps" which must be scraped and passed along when modifying it
     along with the "sig" hidden input value to prevent xss attacks.
     """
     headers = {'Cookie': self.cookie}
     conn = HTTPConnection('www.google.com')
     conn.request('GET', '/alerts/edit?hl=en&gl=us&s=%s' % alert._s, None, headers)
     response = conn.getresponse()
     body = response.read()
     try:
         if response.status != 200:
             raise UnexpectedResponseError(response.status, response.getheaders(), body)
     finally:
         conn.close()
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'sig'})['value']
     sig = str(sig)
     es = soup.findChild('input', attrs={'name': 'es'})['value']
     es = str(es)
     hps = soup.findChild('input', attrs={'name': 'hps'})['value']
     hps = str(hps)
     return sig, es, hps
Example #9
def getShowsList():
    main_idx = fetchPage(INDEX_URL)
    idx_tree = BeautifulSoup(main_idx)

    # Collect the show-index links inside the "bottombrowse" div, skipping the
    # generic "/shows/" link itself.
    browse_div = idx_tree.findChild(name="div", attrs={"id": re.compile("\\bbottombrowse\\b")})
    showidx_links = filter(lambda x: x.get("href") != "/shows/",
                           browse_div.findChildren(name="a", attrs={"class": re.compile("\\bnu\\b")}))
    showidx_links = map(lambda x: SHOW_URL + x.get('href').encode('utf-8'), showidx_links)

    shows_list = []
    for url in showidx_links:
        shows_list.extend(parseShowIndexPage(fetchPage(url)))

    shows_list.sort(key=lambda x: long(x["number"]), reverse=True)
    ret = {"shows": shows_list}
    ret["newest"] = max(ret["shows"], key=lambda x: int(x["number"]))
    ret["oldest"] = min(ret["shows"], key=lambda x: int(x["number"]))
    return ret
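# Note on the filter above: passing a compiled regex as an attribute value makes
# BeautifulSoup match the attribute against that pattern, so \bnu\b picks up the class
# even when other classes are present. A minimal sketch with made-up markup:
import re
from BeautifulSoup import BeautifulSoup

html = '<div id="bottombrowse wide"><a class="nu big" href="/shows/1/">Show 1</a>' \
       '<a class="menu" href="/shows/">All shows</a></div>'
tree = BeautifulSoup(html)
browse = tree.findChild(name="div", attrs={"id": re.compile(r"\bbottombrowse\b")})
print [a["href"] for a in browse.findChildren(name="a", attrs={"class": re.compile(r"\bnu\b")})]
# [u'/shows/1/']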
Example #10
    def _najdi_datum_na_novost(self, url):
        '''Gets the last-modified date of an individual news item.'''
        req = urllib2.Request(url)

        try:
            # The page footer looks like:
            # "Последна промена: 09:52 - среда, 16 јуни 2010Електротехнички факултет"
            # ("Last modified: 09:52 - Wednesday, 16 June 2010", followed by the faculty name)
            resp = urllib2.urlopen(req)
            soup = BeautifulSoup(resp.read())
            datum_div = soup.findChild('div', {'id': 'footer'})

            # Slice out just the "09:43 - среда, 16 јуни 2010" portion
            datum = datum_div.getText()[18:-24]

            return datetime.strptime(datum.encode('utf-8'), "%H:%M - %A, %d %B %Y")
        except (ValueError, urllib2.URLError):
            # Fall back to "now" if the page cannot be fetched or the date cannot be parsed.
            return datetime.now()
Example #11
 def _scrape_sig(self, path='/alerts'):
     """
     Google signs forms with a value in a hidden input named "x" to
     prevent xss attacks, so we need to scrape this out and submit it along
     with any forms we POST.
     """
     url = 'http://www.google.com%s' % path
     response = self.opener.open(url)
     resp_code = response.getcode()
     body = response.read()
     if resp_code != 200:
         raise UnexpectedResponseError(
             resp_code,
             response.info().headers,
             body,
         )
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'x'})['value']
     return str(sig)
Example #12
 def _scrape_sig(self, path='/alerts'):
     """
     Google signs forms with a value in a hidden input named "x" to
     prevent xss attacks, so we need to scrape this out and submit it along
     with any forms we POST.
     """
     url = 'http://www.google.com%s' % path
     response = self.opener.open(url)
     resp_code = response.getcode()
     body = response.read()
     if resp_code != 200:
         raise UnexpectedResponseError(
             resp_code,
             response.info().headers,
             body,
             )
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'x'})['value']
     return str(sig)
Example #13
 def _scrape_sig(self, path='/alerts'):
     """
     Google signs forms with a value in a hidden input named "sig" to
     prevent xss attacks, so we need to scrape this out and submit it along
     with any forms we POST.
     """
     headers = {'Cookie': self.cookie}
     conn = HTTPConnection('www.google.com')
     conn.request('GET', path, None, headers)
     response = conn.getresponse()
     body = response.read()
     try:
         if response.status != 200:
             raise UnexpectedResponseError(response.status, response.getheaders(), body)
     finally:
         conn.close()
     soup = BeautifulSoup(body)
     sig = soup.findChild('input', attrs={'name': 'sig'})['value']
     return str(sig)
def getOpeningLines():
    opener = urllib2.build_opener()
    response = opener.open("http://www.vegasinsider.com/nhl/odds/las-vegas/")
    if response.info().get('Content-Encoding') == 'gzip':
        f = gzip.GzipFile(fileobj=_StringIO(response.read()))
        page = f.read()
    else:
        # Without the gzip branch `page` would be undefined, so stop here.
        print "Page wasn't served with the expected Content-Encoding. Ending execution."
        return

    soup = BeautifulSoup(page)

    mainTable = soup.findChild('td', {"class": "viBodyBorderNorm"})
    tables = mainTable.findAll('table')

    oddsTable = tables[1]

    rows = oddsTable.findAll('tr')

    for aRow in rows:
        teams = aRow.findChildren('a', {"class": "tabletext"})
        print teams
Example #15
def _parse_table(table_data):
    """
    Parse the html table structure which contains all headers and students.
    :param table_data: string with html table structure
    :return: list of headers, list of all user data
    """
    parsed_html = BeautifulSoup(table_data, "html5lib")
    table = parsed_html.findChild("tbody")
    # Read the table content including the heading
    content = table.findChildren("tr")
    # Create a list with all table headers
    headers = [h.text for h in content[0].findChildren("th")]
    # Create one row of cell values per user.
    # The elements in each row correspond to the data entries for that user,
    # in the same order as the headers list.
    entries = [[e.text for e in user_row.findChildren("td")]
               for user_row in content[1:]]
    # Sanity check: Make sure that each entry contains all necessary information.
    if not all(len(e) == len(headers) for e in entries):
        raise CorruptDataException(
            "Some data entries missing requiered fields.")
    # Return the headers and the sorted data.
    return headers, sorted(entries, key=lambda e: e[0].lower())
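# A minimal usage sketch for _parse_table (the roster HTML below is invented; it just
# follows the header-row-plus-data-rows layout the parser assumes):
sample_table = """
<table><tbody>
  <tr><th>Name</th><th>Email</th></tr>
  <tr><td>Zoe</td><td>zoe@example.org</td></tr>
  <tr><td>Adam</td><td>adam@example.org</td></tr>
</tbody></table>
"""

headers, entries = _parse_table(sample_table)
print(headers)   # ['Name', 'Email']
print(entries)   # [['Adam', 'adam@example.org'], ['Zoe', 'zoe@example.org']] -- sorted by first column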
# BeautifulSoup is required - install it using easy_install or pip
from BeautifulSoup import BeautifulSoup


################################## Parse file ##################################
xml_string  = open('books.xml').read()
xml         = BeautifulSoup(xml_string)

############################# Document properties ##############################
print type(xml)         # <class 'BeautifulSoup.BeautifulSoup'>
print xml.name          # [document]

########################### Root node (the catalog) ############################
# Equivalent statements
root = xml.findChild("catalog")
root = xml.catalog
print root.name         # "catalog"

#################################### Books #####################################
books = root.findAll(name="book")
print [tag.name  for tag in books]  # [u'book', u'book', u'book', u'book', u'book']
print [tag.text  for tag in books]  # All the text between "<book>" and "</book>", we probably don't want this
                                    # [u"Gambardella, MatthewXML Developer's GuideComputer44.952000-10-01An in-depth look at creating applications \n      with XML.", u'Ralls, KimMidnight RainFantasy5.952000-12-16A former architect battles corporate zombies, \n      an evil sorceress, and her own childhood to become queen \n      of the world.', u'Corets, EvaMaeve AscendantFantasy5.952000-11-17After the collapse of a nanotechnology \n      society in England, the young survivors lay the \n      foundation for a new society.', u"Corets, EvaOberon's LegacyFantasy5.952001-03-10In post-apocalypse England, the mysterious \n      agent known only as Oberon helps to create a new life \n      for the inhabitants of London. Sequel to Maeve \n      Ascendant.", u"Corets, EvaThe Sundered GrailFantasy5.952001-09-10The two daughters of Maeve, half-sisters, \n      battle one another for control of England. Sequel to \n      Oberon's Legacy."]
print [tag.attrs for tag in books]  # [[(u'id', u'bk101')], [(u'id', u'bk102')], [(u'id', u'bk103')], [(u'id', u'bk104')], [(u'id', u'bk105')]]

######################## Book titles (all 'title' tags) ########################
titles = root.findAll(name="title")
print [tag.name for tag in titles]  # [u'title', u'title', u'title', u'title', u'title']
print [tag.text for tag in titles]  # [u"XML Developer's Guide", u'Midnight Rain', u'Maeve Ascendant', u"Oberon's Legacy", u'The Sundered Grail']
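#################### Single book (lookup by 'id' attribute) ####################
# A small extension of the above: findChild can also select one book by its id
# attribute and then drill into its children (ids bk101..bk105, as shown in the
# attrs output above).
first_book = root.findChild(name="book", attrs={"id": "bk101"})
print first_book["id"]                      # bk101
print first_book.findChild("author").text   # Gambardella, Matthew
print first_book.findChild("title").text    # XML Developer's Guide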
Example #17
def _create_elements_py(filename="_elements.py"):
    """ Gets data from webelements.com and creates _elements.py. """
    import re
    from pickle import dumps
    import urllib.request, urllib.parse, urllib.error
    from os.path import exists, join
    from BeautifulSoup import BeautifulSoup, HTMLParseError
    from ..physics import a0
    import quantities as pq

    atom_list = [  # 'Silicon', 'Hydrogen', 'Gold' ]
                 'Ruthenium', 'Rhenium', 'Rutherfordium', 'Radium', 'Rubidium',
                 'Radon', 'Rhodium', 'Beryllium', 'Barium', 'Bohrium', 'Bismuth',
                 'Berkelium', 'Bromine', 'Hydrogen', 'Phosphorus', 'Osmium', 'Mercury',
                 'Germanium', 'Gadolinium', 'Gallium', 'Ununbium', 'Praseodymium',
                 'Platinum', 'Plutonium', 'Carbon', 'Lead', 'Protactinium', 'Palladium',
                 'Xenon', 'Polonium', 'Promethium', 'Hassium',
                 'Holmium', 'Hafnium', 'Molybdenum', 'Helium', 'Mendelevium', 'Magnesium',
                 'Potassium', 'Manganese', 'Oxygen', 'Meitnerium', 'Sulfur', 'Tungsten',
                 'Zinc', 'Europium', 'Einsteinium', 'Erbium', 'Nickel', 'Nobelium',
                 'Sodium', 'Niobium', 'Neodymium', 'Neon', 'Neptunium', 'Francium', 'Iron',
                 'Fermium', 'Boron', 'Fluorine', 'Strontium', 'Nitrogen', 'Krypton',
                 'Silicon', 'Tin', 'Samarium', 'Vanadium', 'Scandium', 'Antimony',
                 'Seaborgium', 'Selenium', 'Cobalt', 'Curium', 'Chlorine', 'Calcium',
                 'Californium', 'Cerium', 'Cadmium', 'Thulium', 'Caesium', 'Chromium',
                 'Copper', 'Lanthanum', 'Lithium', 'Thallium', 'Lutetium', 'Lawrencium',
                 'Thorium', 'Titanium', 'Tellurium', 'Terbium', 'Technetium', 'Tantalum',
                 'Ytterbium', 'Dubnium', 'Zirconium', 'Dysprosium', 'Iodine', 'Uranium',
                 'Yttrium', 'Actinium', 'Silver', 'Iridium', 'Americium', 'Aluminium',
                 'Arsenic', 'Argon', 'Gold', 'Astatine', 'Indium']

    orbital_radii = _orbital_radii()
    pettifor_numbers = _pettifor_numbers()

    re_swf = re.compile(r"(rainbow|NI3|volcano|\_flash|K\_H2O).swf\s*(?!\")")
    re_atomweight = re.compile(r":\s*\[?\s*(\d+(?:\.\d+)?)\s*\]?")
    results = {}
    for name in atom_list:

        # first opens and reads file.
        if not exists(join("elements", name)):
            file = urllib.request.urlopen("http://www.webelements.com/{0}".format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name), "r") as file:
                string = file.read()
        string = string.replace("alt\"", "alt=\"")
        soup = BeautifulSoup(re.sub(re_swf, "rainbow.swf\"", string))

        atom = Element(name=name)
        atom.symbol = soup.findChild(name="a", attrs={"title": "Element names and symbols"},
                                     text=" Symbol").parent.parent.contents[1].split()[1]
        atom.atomic_number = soup.findChild(name="a", attrs={"title": "Element atomic numbers"})\
            .parent.contents[-1].split()[1]
        atom.atomic_number = int(atom.atomic_number)
        atom.atomic_weight = soup.findChild(name="a", attrs={"title": "Element atomic weights"})\
            .parent.prettify()
        found = re_atomweight.search(atom.atomic_weight)
        if found is None:
            print(name)
        else:
            atom.atomic_weight = float(found.group(1))

        # ionization stuff
        if not exists(join("elements", name + "_atoms.html")):
            file = urllib.request.urlopen("http://www.webelements.com/{0}/atoms.html".format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name + "_atoms.html"), "r") as file:
                string = file.read()
        soup = BeautifulSoup(string)
        # electron affinity
        found = re.search(r"of\s+{0}\s+is\s+(\S+)".format(name.lower()), string)
        if found.group(1) == "no":
            atom.electron_affinity = None
        else:
            atom.electron_affinity = float(found.group(1)) * pq.kilo * pq.J / pq.mol
        # ionization energies
        energies = []
        for child in soup.findChild(name="table", attrs={"class": "chemistry-data"})\
                         .findChildren(name='td'):
            energies.append(float(child.string) * pq.kilo * pq.J / pq.mol)
        atom.ionization_energies = energies if len(energies) > 0 else None

        # electronegativities.
        if not exists(join("elements", name + "_electronegativity.html")):
            file = urllib.request.urlopen("http://www.webelements.com/{0}/electronegativity.html"
                                  .format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name + "_electronegativity.html"), "r") as file:
                string = file.read()
        soup = BeautifulSoup(string)
        attrs = {"href": "../periodicity/electronegativity_pauling/",
                 "title": "View definition and pictures showing periodicity "
                 "of Pauling electronegativity"}
        pauling = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
        pauling = pauling.split()[0]
        atom.pauling = float(pauling) if pauling != "no" else None

        attrs = {"href": "../periodicity/electronegativity_sanderson/"}
        sanderson = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
        sanderson = sanderson.split()[0]
        atom.sanderson = float(sanderson) if sanderson != "no" else None

        attrs = {"href": "../periodicity/electroneg_allred_rochow/"}
        allred_rochow = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
        allred_rochow = allred_rochow.split()[0]
        atom.allred_rochow = float(allred_rochow) if allred_rochow != "no" else None

        attrs = {"href": "../periodicity/electroneg_mulliken_jaffe/"}
        mulliken_jaffe = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1]
        if name in ["Germanium", "Gallium", "Carbon", "Lead", "Boron", "Silicon", "Tin",
                    "Thallium", "Aluminium", "Indium"]:
            mulliken_jaffe = mulliken_jaffe.contents[0]
        else:
            mulliken_jaffe = mulliken_jaffe.string
        mulliken_jaffe = mulliken_jaffe.split()[0]
        atom.mulliken_jaffe = float(mulliken_jaffe) if mulliken_jaffe != "no" else None

        attrs = {"href": "../periodicity/electronegativity_allen/"}
        allen = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
        allen = allen.split()[0]
        atom.allen = float(allen) if allen != "no" else None

        # atom sizes
        if not exists(join("elements", name + "_atom_sizes.html")):
            file = urllib.request.urlopen("http://www.webelements.com/{0}/atom_sizes.html"
                                  .format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name + "_atom_sizes.html"), "r") as file:
                string = file.read()
        soup = BeautifulSoup(string)

        # atomic radius
        attrs = {"href": "../periodicity/atomic_radius_empirical/"}
        atomic_radius = soup.findChild(name="a", attrs=attrs).parent.contents[-1].split()[1]
        if atomic_radius != "no":
            atom.atomic_radius = float(atomic_radius) * pq.picometre

        attrs = {"href": "../periodicity/covalent_radius_2008/"}
        covalent_radius = soup.findChild(name="a", attrs=attrs).parent.contents[-1].split()[1]
        atom.covalent_radius = float(covalent_radius) * \
            pq.picometre if covalent_radius != "no" else None

        attrs = {"href": "../periodicity/radii_covalent_single/"}
        single_bond_radius = soup.findChild(name="a", attrs=attrs)
        if single_bond_radius is not None:
            single_bond_radius = single_bond_radius.parent.contents[-1].split()[1]
            if single_bond_radius != "no":
                atom.single_bond_radius = float(single_bond_radius) * pq.picometre

        attrs = {"href": "../periodicity/radii_covalent_double/"}
        double_bond_radius = soup.findChild(name="a", attrs=attrs)
        if double_bond_radius is not None:
            double_bond_radius = double_bond_radius.parent.contents[-1].split()[1]
            if double_bond_radius != "no":
                atom.double_bond_radius = float(double_bond_radius) * pq.picometre

        attrs = {"href": "../periodicity/radii_covalent_triple/"}
        triple_bond_radius = soup.findChild(name="a", attrs=attrs)
        if triple_bond_radius is not None:
            triple_bond_radius = triple_bond_radius.parent.contents[-1].split()[1]
            if triple_bond_radius != "no":
                atom.triple_bond_radius = float(triple_bond_radius) * pq.picometre

        attrs = {"href": "../periodicity/van_der_waals_radius/"}
        van_der_waals_radius = soup.findChild(name="a", attrs=attrs)
        if van_der_waals_radius is not None:
            van_der_waals_radius = van_der_waals_radius.parent.contents[-1].split()[1]
            if van_der_waals_radius != "no":
                atom.van_der_waals_radius = float(van_der_waals_radius) * pq.picometre

        # thermochemistry
        if not exists(join("elements", name + "_thermochemistry.html")):
            file = urllib.request.urlopen("http://www.webelements.com/{0}/thermochemistry.html"
                                  .format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name + "_thermochemistry.html"), "r") as file:
                string = file.read()
        soup = BeautifulSoup(string)

        attrs = {"href": "../periodicity/enthalpy_fusion/"}
        fusion = soup.findChild(name="a", attrs=attrs).parent.prettify()
        fusion = re.search(r":\s*(?:about)?\s*(\S+)", fusion)
        if fusion is not None and fusion.group(1) != "no":
            atom.fusion = float(fusion.group(1)) * pq.kilo * pq.J / pq.mol

        attrs = {"href": "../periodicity/enthalpy_vaporisation/"}
        vaporization = soup.findChild(name="a", attrs=attrs).parent.prettify()
        vaporization = re.search(r":\s*(?:about)?\s*(\S+)", vaporization)
        if vaporization is not None and vaporization.group(1) != "no":
            atom.vaporization = float(vaporization.group(1)) * pq.kilo * pq.J / pq.mol

        attrs = {"href": "../periodicity/enthalpy_atomisation/"}
        atomization = soup.findChild(name="a", attrs=attrs).parent.prettify()
        atomization = re.search(r":\s*(?:about)?\s*(\S+)", atomization)
        if atomization is not None and atomization.group(1) != "no":
            atom.atomization = float(atomization.group(1)) * pq.kilo * pq.J / pq.mol

        # physics
        if not exists(join("elements", name + "_physics.html")):
            file = urllib.request.urlopen("http://www.webelements.com/{0}/physics.html"
                                  .format(name.lower()))
            string = file.read()
            file.close()
        else:
            with open(join("elements", name + "_physics.html"), "r") as file:
                string = file.read()
        soup = BeautifulSoup(string)

        attrs = {"href": "../periodicity/melting_point/"}
        melting_point = soup.findChild(name="a", attrs=attrs).parent.prettify()
        melting_point = re.search(r":\s*(?:\(white P\)|about|maybe about)?\s*(\S+)", melting_point)
        if melting_point is not None and melting_point.group(1) != "no":
            atom.melting_point = float(melting_point.group(1)) * pq.Kelvin

        attrs = {"href": "../periodicity/boiling_point/"}
        boiling_point = soup.findChild(name="a", attrs=attrs).parent.prettify()
        boiling_point = re.search(r":\s*(?:about)?\s*(\S+)", boiling_point)
        if boiling_point is not None and boiling_point.group(1) != "no":
            atom.boiling_point = float(boiling_point.group(1)) * pq.Kelvin

        attrs = {"href": "../periodicity/critical_temperature/"}
        critical_temperature = soup.findChild(name="a", attrs=attrs).parent.prettify()
        critical_temperature = re.search(r":\s*(?:about)?\s*(\S+)", critical_temperature)
        if critical_temperature is not None and critical_temperature.group(1) != "no":
            atom.critical_temperature = float(critical_temperature.group(1)) * pq.Kelvin

        attrs = {"href": "../periodicity/thermal_conductivity/"}
        thermal_conductivity = soup.findChild(name="a", attrs=attrs).parent.prettify()
        thermal_conductivity = re.search(r":\s*(?:about)?\s*(\S+)", thermal_conductivity)
        if thermal_conductivity is not None and thermal_conductivity.group(1) != "no":
            atom.thermal_conductivity = float(thermal_conductivity.group(1)) * pq.W / pq.m / pq.K

        attrs = {"href": "../periodicity/coeff_thermal_expansion/"}
        thermal_expansion = soup.findChild(name="a", attrs=attrs).parent.prettify()
        thermal_expansion = re.search(r":\s*(?:about)?\s*(\S+)", thermal_expansion)
        if thermal_expansion is not None and thermal_expansion.group(1) != "no":
            atom.thermal_expansion = float(thermal_expansion.group(1)) * pq.micro / pq.K

        attrs = {"href": "../periodicity/density/"}
        density = soup.findChild(name="a", attrs=attrs).parent.prettify()
        density = re.search(r":\s*(?:about)?\s*(\S+)", density)
        if density is not None and density.group(1) != "no":
            atom.density = float(density.group(1)) / 1000 * pq.g * pq.cm**3

        attrs = {"href": "../periodicity/molar_volume/"}
        molar_volume = soup.findChild(name="a", attrs=attrs).parent.prettify()
        molar_volume = re.search(r":\s*(?:about)?\s*(\S+)", molar_volume)
        if molar_volume is not None and molar_volume.group(1) != "no":
            atom.molar_volume = float(molar_volume.group(1)) * pq.cm**3 / pq.mol

        attrs = {"href": "../periodicity/velocity_sound/"}
        sound_velocity = soup.findChild(name="a", attrs=attrs).parent.prettify()
        sound_velocity = re.search(r":\s*(?:about)?\s*(\S+)", sound_velocity)
        if sound_velocity is not None and sound_velocity.group(1) != "no":
            atom.sound_velocity = float(sound_velocity.group(1)) * pq.m / pq.s

        attrs = {"href": "../periodicity/youngs_modulus/"}
        young_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
        young_modulus = re.search(r":\s*(?:about)?\s*(\S+)", young_modulus)
        if young_modulus is not None and young_modulus.group(1) != "no":
            atom.young_modulus = float(young_modulus.group(1)) * pq.GPa

        attrs = {"href": "../periodicity/rigidity_modulus/"}
        rigidity_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
        rigidity_modulus = re.search(r":\s*(?:about)?\s*(\S+)", rigidity_modulus)
        if rigidity_modulus is not None and rigidity_modulus.group(1) != "no":
            atom.rigidity_modulus = float(rigidity_modulus.group(1)) * pq.GPa

        attrs = {"href": "../periodicity/bulk_modulus/"}
        bulk_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
        bulk_modulus = re.search(r":\s*(?:about)?\s*(\S+)", bulk_modulus)
        if bulk_modulus is not None and bulk_modulus.group(1) != "no":
            atom.bulk_modulus = float(bulk_modulus.group(1)) * pq.GPa

        attrs = {"href": "../periodicity/poissons_ratio/"}
        poisson_ratio = soup.findChild(name="a", attrs=attrs).parent.prettify()
        poisson_ratio = re.search(r":\s*(?:about)?\s*(\S+)", poisson_ratio)
        if poisson_ratio is not None and poisson_ratio.group(1) != "no":
            atom.poisson_ratio = float(poisson_ratio.group(1)) * pq.dimensionless

        attrs = {"href": "../periodicity/electrical_resistivity/"}
        electrical_resistivity = soup.findChild(name="a", attrs=attrs).parent.prettify()
        electrical_resistivity = re.search(
            r":\s*(?:about)?\s*(\d+(?:\.\d+)?)", electrical_resistivity)
        if electrical_resistivity is not None and electrical_resistivity.group(1) not in ["no", "&gt;"]:
            atom.electrical_resistivity = float(
                electrical_resistivity.group(1)) * 1e-8 * pq.ohm * pq.m

        results[str(atom.symbol)] = atom

        if atom.symbol in orbital_radii:
            au = a0("A") * pq.angstrom
            results[str(atom.symbol)].orbital_radii = tuple(
                [u * au for u in orbital_radii[atom.symbol]])
        if atom.symbol in pettifor_numbers:
            results[str(atom.symbol)].pettifor = pettifor_numbers[atom.symbol]

    with open(filename, "w") as file:
        file.write("\"\"\" Definition of the elements. \"\"\"\n")
        file.write("\nfrom numpy import array\n")
        file.write("\nfrom quantities import *\n")
        file.write("\nfrom . import Element\n")
        file.write("\n__dir__ = ['elements', 'symbols']\n")
        file.write("\nelements = " + repr(results) + "\n")
        keys = []
        for n in range(1, len(results)):
            for key, value in results.items():
                if value.atomic_number == n:
                    keys.append(str(key))
        file.write("\nsymbols = {0}\n".format(keys))
Example #18
def _create_elements_py(filename="_elements.py"):
  """ Gets data from webelements.com and creates _elements.py. """
  import re
  from pickle import dumps
  import urllib
  from os.path import exists, join
  from BeautifulSoup import BeautifulSoup, HTMLParseError
  from ..physics import a0
  import quantities as pq

  atom_list = [ # 'Silicon', 'Hydrogen', 'Gold' ] 
               'Ruthenium', 'Rhenium', 'Rutherfordium', 'Radium', 'Rubidium',
               'Radon', 'Rhodium', 'Beryllium', 'Barium', 'Bohrium', 'Bismuth',
               'Berkelium', 'Bromine', 'Hydrogen', 'Phosphorus', 'Osmium', 'Mercury',
               'Germanium', 'Gadolinium', 'Gallium', 'Ununbium', 'Praseodymium',
               'Platinum', 'Plutonium', 'Carbon', 'Lead', 'Protactinium', 'Palladium',
               'Xenon', 'Polonium', 'Promethium', 'Hassium',
               'Holmium', 'Hafnium', 'Molybdenum', 'Helium', 'Mendelevium', 'Magnesium',
               'Potassium', 'Manganese', 'Oxygen', 'Meitnerium', 'Sulfur', 'Tungsten',
               'Zinc', 'Europium', 'Einsteinium', 'Erbium', 'Nickel', 'Nobelium',
               'Sodium', 'Niobium', 'Neodymium', 'Neon', 'Neptunium', 'Francium', 'Iron',
               'Fermium', 'Boron', 'Fluorine', 'Strontium', 'Nitrogen', 'Krypton',
               'Silicon', 'Tin', 'Samarium', 'Vanadium', 'Scandium', 'Antimony',
               'Seaborgium', 'Selenium', 'Cobalt', 'Curium', 'Chlorine', 'Calcium',
               'Californium', 'Cerium', 'Cadmium', 'Thulium', 'Caesium', 'Chromium',
               'Copper', 'Lanthanum', 'Lithium', 'Thallium', 'Lutetium', 'Lawrencium',
               'Thorium', 'Titanium', 'Tellurium', 'Terbium', 'Technetium', 'Tantalum',
               'Ytterbium', 'Dubnium', 'Zirconium', 'Dysprosium', 'Iodine', 'Uranium',
               'Yttrium', 'Actinium', 'Silver', 'Iridium', 'Americium', 'Aluminium',
               'Arsenic', 'Argon', 'Gold', 'Astatine', 'Indium']

  orbital_radii = _orbital_radii()
  pettifor_numbers = _pettifor_numbers()

  re_swf = re.compile("(rainbow|NI3|volcano|\_flash|K\_H2O).swf\s*(?!\")")
  re_atomweight = re.compile(":\s*\[?\s*(\d+(?:\.\d+)?)\s*\]?")
  results = {}
  for name in atom_list: 

    # first opens and reads file.
    if not exists(join("elements", name)): 
      file = urllib.urlopen("http://www.webelements.com/{0}".format(name.lower()))
      string = file.read()
      file.close()
    else:
      with open(join("elements", name), "r") as file: string = file.read()
    string = string.replace("alt\"", "alt=\"")
    soup = BeautifulSoup(re.sub(re_swf,"rainbow.swf\"",string))

    atom = Element(name=name)
    atom.symbol = soup.findChild( name="a", attrs={"title": "Element names and symbols"},\
                                  text=" Symbol").parent.parent.contents[1].split()[1]
    atom.atomic_number = soup.findChild(name="a", attrs={"title": "Element atomic numbers"})\
                                       .parent.contents[-1].split()[1]
    atom.atomic_number = int(atom.atomic_number)
    atom.atomic_weight = soup.findChild(name="a", attrs={"title": "Element atomic weights"})\
                                       .parent.prettify()
    found = re_atomweight.search(atom.atomic_weight)
    if found is None: print name
    else: atom.atomic_weight = float(found.group(1))

    
    # ionization stuff
    if not exists(join("elements", name + "_atoms.html")):
      file = urllib.urlopen("http://www.webelements.com/{0}/atoms.html".format(name.lower()))
      string = file.read()
      file.close()
    else: 
      with open(join("elements", name + "_atoms.html"), "r") as file: string = file.read()
    soup = BeautifulSoup(string) 
    # electron affinity
    found = re.search("of\s+{0}\s+is\s+(\S+)".format(name.lower()), string)
    if found.group(1) == "no": atom.electron_affinity = None
    else: atom.electron_affinity = float(found.group(1)) * pq.kilo * pq.J / pq.mol
    # ionization energies
    energies = []
    for child in soup.findChild(name="table", attrs={"class":"chemistry-data"})\
                     .findChildren(name='td'):
      energies.append(float(child.string) * pq.kilo * pq.J / pq.mol)
    atom.ionization_energies = energies if len(energies) > 0 else None


    # electronegativities.
    if not exists(join("elements", name + "_electronegativity.html")):
      file = urllib.urlopen("http://www.webelements.com/{0}/electronegativity.html"\
                            .format(name.lower()))
      string = file.read()
      file.close()
    else: 
      with open(join("elements", name + "_electronegativity.html"), "r") as file:
          string = file.read()
    soup = BeautifulSoup(string) 
    attrs = { "href": "../periodicity/electronegativity_pauling/",\
              "title": "View definition and pictures showing periodicity "\
                       "of Pauling electronegativity"}
    pauling = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
    pauling = pauling.split()[0]
    atom.pauling = float(pauling) if pauling != "no" else None

    attrs = { "href": "../periodicity/electronegativity_sanderson/" }
    sanderson = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
    sanderson = sanderson.split()[0]
    atom.sanderson = float(sanderson) if sanderson != "no" else None

    attrs = { "href": "../periodicity/electroneg_allred_rochow/" }
    allred_rochow = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
    allred_rochow = allred_rochow.split()[0]
    atom.allred_rochow = float(allred_rochow) if allred_rochow != "no" else None

    attrs = { "href": "../periodicity/electroneg_mulliken_jaffe/" }
    mulliken_jaffe = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1]
    if name in ["Germanium", "Gallium", "Carbon", "Lead", "Boron", "Silicon", "Tin",\
                "Thallium", "Aluminium", "Indium"]: 
      mulliken_jaffe = mulliken_jaffe.contents[0]
    else: mulliken_jaffe = mulliken_jaffe.string
    mulliken_jaffe = mulliken_jaffe.split()[0]
    atom.mulliken_jaffe = float(mulliken_jaffe) if mulliken_jaffe != "no" else None

    attrs = { "href": "../periodicity/electronegativity_allen/" }
    allen = soup.findChild(name="a", attrs=attrs).parent.parent.contents[-1].string
    allen = allen.split()[0]
    atom.allen = float(allen) if allen != "no" else None
    
    # atom sizes
    if not exists(join("elements", name + "_atom_sizes.html")):
      file = urllib.urlopen("http://www.webelements.com/{0}/atom_sizes.html"\
                            .format(name.lower()))
      string = file.read()
      file.close()
    else: 
      with open(join("elements", name + "_atom_sizes.html"), "r") as file:
          string = file.read()
    soup = BeautifulSoup(string) 
    
    # atomic radius
    attrs = { "href": "../periodicity/atomic_radius_empirical/" }
    atomic_radius = soup.findChild(name="a", attrs=attrs).parent.contents[-1].split()[1]
    if atomic_radius != "no":
      atom.atomic_radius = float(atomic_radius) * pq.picometre 
    
    attrs = { "href": "../periodicity/covalent_radius_2008/" }
    covalent_radius = soup.findChild(name="a", attrs=attrs).parent.contents[-1].split()[1]
    atom.covalent_radius = float(covalent_radius) * pq.picometre if covalent_radius != "no" else None

    attrs = { "href": "../periodicity/radii_covalent_single/" }
    single_bond_radius = soup.findChild(name="a", attrs=attrs)
    if single_bond_radius is not None:
      single_bond_radius = single_bond_radius.parent.contents[-1].split()[1]
      if single_bond_radius != "no": 
        atom.single_bond_radius = float(single_bond_radius) * pq.picometre

    attrs = { "href": "../periodicity/radii_covalent_double/" }
    double_bond_radius = soup.findChild(name="a", attrs=attrs)
    if double_bond_radius is not None:
      double_bond_radius = double_bond_radius.parent.contents[-1].split()[1]
      if double_bond_radius != "no": 
        atom.double_bond_radius = float(double_bond_radius) * pq.picometre

    attrs = { "href": "../periodicity/radii_covalent_triple/" }
    triple_bond_radius = soup.findChild(name="a", attrs=attrs)
    if triple_bond_radius is not None:
      triple_bond_radius = triple_bond_radius.parent.contents[-1].split()[1]
      if triple_bond_radius != "no": 
        atom.triple_bond_radius = float(triple_bond_radius) * pq.picometre

    attrs = { "href": "../periodicity/van_der_waals_radius/" }
    van_der_waals_radius = soup.findChild(name="a", attrs=attrs)
    if van_der_waals_radius is not None:
      van_der_waals_radius = van_der_waals_radius.parent.contents[-1].split()[1]
      if van_der_waals_radius != "no": 
        atom.van_der_waals_radius = float(van_der_waals_radius) * pq.picometre

    # thermochemistry
    if not exists(join("elements", name + "_thermochemistry.html")):
      file = urllib.urlopen("http://www.webelements.com/{0}/thermochemistry.html"\
                            .format(name.lower()))
      string = file.read()
      file.close()
    else: 
      with open(join("elements", name + "_thermochemistry.html"), "r") as file:
          string = file.read()
    soup = BeautifulSoup(string) 
    
    attrs = { "href": "../periodicity/enthalpy_fusion/" }
    fusion = soup.findChild(name="a", attrs=attrs).parent.prettify()
    fusion = re.search(":\s*(?:about)?\s*(\S+)", fusion)
    if fusion is not None and fusion.group(1) != "no":
      atom.fusion = float(fusion.group(1)) * pq.kilo * pq.J / pq.mol 

    attrs = { "href": "../periodicity/enthalpy_vaporisation/" }
    vaporization = soup.findChild(name="a", attrs=attrs).parent.prettify()
    vaporization = re.search(":\s*(?:about)?\s*(\S+)", vaporization)
    if vaporization is not None and vaporization.group(1) != "no":
      atom.vaporization = float(vaporization.group(1)) * pq.kilo * pq.J / pq.mol 

    attrs = { "href": "../periodicity/enthalpy_atomisation/" }
    atomization = soup.findChild(name="a", attrs=attrs).parent.prettify()
    atomization = re.search(":\s*(?:about)?\s*(\S+)", atomization)
    if atomization is not None and atomization.group(1) != "no":
      atom.atomization = float(atomization.group(1)) * pq.kilo * pq.J / pq.mol 

    # physics
    if not exists(join("elements", name + "_physics.html")):
      file = urllib.urlopen("http://www.webelements.com/{0}/physics.html"\
                            .format(name.lower()))
      string = file.read()
      file.close()
    else: 
      with open(join("elements", name + "_physics.html"), "r") as file:
          string = file.read()
    soup = BeautifulSoup(string) 

    attrs = { "href": "../periodicity/melting_point/" }
    melting_point = soup.findChild(name="a", attrs=attrs).parent.prettify()
    melting_point = re.search(":\s*(?:\(white P\)|about|maybe about)?\s*(\S+)", melting_point)
    if melting_point is not None and melting_point.group(1) != "no":
      atom.melting_point = float(melting_point.group(1)) * pq.Kelvin

    attrs = { "href": "../periodicity/boiling_point/" }
    boiling_point = soup.findChild(name="a", attrs=attrs).parent.prettify()
    boiling_point = re.search(":\s*(?:about)?\s*(\S+)", boiling_point)
    if boiling_point is not None and boiling_point.group(1) != "no":
      atom.boiling_point = float(boiling_point.group(1)) * pq.Kelvin

    attrs = { "href": "../periodicity/critical_temperature/" }
    critical_temperature = soup.findChild(name="a", attrs=attrs).parent.prettify()
    critical_temperature = re.search(":\s*(?:about)?\s*(\S+)", critical_temperature)
    if critical_temperature is not None and critical_temperature.group(1) != "no":
      atom.critical_temperature = float(critical_temperature.group(1)) * pq.Kelvin

    attrs = { "href": "../periodicity/thermal_conductivity/" }
    thermal_conductivity = soup.findChild(name="a", attrs=attrs).parent.prettify()
    thermal_conductivity = re.search(":\s*(?:about)?\s*(\S+)", thermal_conductivity)
    if thermal_conductivity is not None and thermal_conductivity.group(1) != "no":
      atom.thermal_conductivity = float(thermal_conductivity.group(1)) * pq.W / pq.m / pq.K

    attrs = { "href": "../periodicity/coeff_thermal_expansion/" }
    thermal_expansion = soup.findChild(name="a", attrs=attrs).parent.prettify()
    thermal_expansion = re.search(":\s*(?:about)?\s*(\S+)", thermal_expansion)
    if thermal_expansion is not None and thermal_expansion.group(1) != "no":
      atom.thermal_expansion = float(thermal_expansion.group(1)) * pq.micro / pq.K

    attrs = { "href": "../periodicity/density/" }
    density = soup.findChild(name="a", attrs=attrs).parent.prettify()
    density = re.search(":\s*(?:about)?\s*(\S+)", density)
    if density is not None and density.group(1) != "no":
      atom.density = float(density.group(1)) / 1000 * pq.g * pq.cm**3

    attrs = { "href": "../periodicity/molar_volume/" }
    molar_volume = soup.findChild(name="a", attrs=attrs).parent.prettify()
    molar_volume = re.search(":\s*(?:about)?\s*(\S+)", molar_volume)
    if molar_volume is not None and molar_volume.group(1) != "no":
      atom.molar_volume = float(molar_volume.group(1)) * pq.cm**3 / pq.mol

    attrs = { "href": "../periodicity/velocity_sound/" }
    sound_velocity = soup.findChild(name="a", attrs=attrs).parent.prettify()
    sound_velocity = re.search(":\s*(?:about)?\s*(\S+)", sound_velocity)
    if sound_velocity is not None and sound_velocity.group(1) != "no":
      atom.sound_velocity = float(sound_velocity.group(1)) * pq.m / pq.s

    attrs = { "href": "../periodicity/youngs_modulus/" }
    young_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
    young_modulus = re.search(":\s*(?:about)?\s*(\S+)", young_modulus)
    if young_modulus is not None and young_modulus.group(1) != "no":
      atom.young_modulus = float(young_modulus.group(1)) * pq.GPa

    attrs = { "href": "../periodicity/rigidity_modulus/" }
    rigidity_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
    rigidity_modulus = re.search(":\s*(?:about)?\s*(\S+)", rigidity_modulus)
    if rigidity_modulus is not None and rigidity_modulus.group(1) != "no":
      atom.rigidity_modulus = float(rigidity_modulus.group(1)) * pq.GPa
    
    attrs = { "href": "../periodicity/bulk_modulus/" }
    bulk_modulus = soup.findChild(name="a", attrs=attrs).parent.prettify()
    bulk_modulus = re.search(":\s*(?:about)?\s*(\S+)", bulk_modulus)
    if bulk_modulus is not None and bulk_modulus.group(1) != "no":
      atom.bulk_modulus = float(bulk_modulus.group(1)) * pq.GPa
    
    attrs = { "href": "../periodicity/poissons_ratio/" }
    poisson_ratio = soup.findChild(name="a", attrs=attrs).parent.prettify()
    poisson_ratio = re.search(":\s*(?:about)?\s*(\S+)", poisson_ratio)
    if poisson_ratio is not None and poisson_ratio.group(1) != "no":
      atom.poisson_ratio = float(poisson_ratio.group(1)) * pq.dimensionless
    
    attrs = { "href": "../periodicity/electrical_resistivity/" }
    electrical_resistivity = soup.findChild(name="a", attrs=attrs).parent.prettify()
    electrical_resistivity = re.search(":\s*(?:about)?\s*(\d+(?:\.\d+)?)", electrical_resistivity)
    if electrical_resistivity is not None and electrical_resistivity.group(1) not in ["no", "&gt;"]:
      atom.electrical_resistivity = float(electrical_resistivity.group(1)) * 1e-8 * pq.ohm * pq.m

    results[str(atom.symbol)] = atom
    
    if atom.symbol in orbital_radii:
      au = a0("A") * pq.angstrom 
      results[str(atom.symbol)].orbital_radii = tuple([u * au for u in orbital_radii[atom.symbol]])
    if atom.symbol in pettifor_numbers:
      results[str(atom.symbol)].pettifor = pettifor_numbers[atom.symbol]