def parse_itunes_transactions(mail_body):
    soup = BeautifulSoup(mail_body)
    receipt = soup.findChildren('table')[2]
    date = list(receipt.findChildren('td')[2].stripped_strings)[3]
    items = []
    number_of_categories = 5
    i = 0
    transaction_table = soup.findChildren('table')[4]
    transactions = transaction_table.findChildren(['tr'])
    for transaction in transactions:
        cells = transaction.findChildren('td')
        row_values = [account]
        for cell in cells:
            values = list(cell.stripped_strings)
            if len(values) > 0:
                text = values[0].encode(csv_text_encoding)
                if (i > 0):
                    row_values.append(get_itunes_values(len(row_values), text))
                else:
                    row_values.append(text)
        if len(row_values) == number_of_categories:
            if i > 0:
                row_values.append(date)
                items.append(row_values)
            i += 1
    return items
Example #2
def extract_details(painting_table: BeautifulSoup) -> dict:
    """Extracts necessary metadata for an image from a table found on the
    image's information page

    Parameters
    ----------
    painting_table : BeautifulSoup Tag, required
        Table found on the info page of an image. Holds important metadata for
        the image including title, artist, medium, date, etc.

    Returns
    -------
    dict
        Dictionary containing all of the metadata for the image
    """

    details = {}

    # Each 'dd' tag holds the actual piece of metadata (the value)
    painting_info = painting_table.findChildren("dd")

    # Each 'dt' tag holds the category (heading) that the 'dd' value is tied to
    info_heading = painting_table.findChildren("dt")

    # For each piece of metadata, tie the info to the category in the dict
    for child in range(0, len(painting_info)):
        details[info_heading[child].text.strip(
        )] = painting_info[child].text.strip()

    # Return the dictionary
    return details
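A minimal usage sketch (not from the original project; the HTML fragment is invented) showing the dt/dd pairing described in the docstring:

from bs4 import BeautifulSoup

sample_html = """
<dl>
  <dt>Artist</dt><dd>Claude Monet</dd>
  <dt>Medium</dt><dd>Oil on canvas</dd>
</dl>
"""
painting_table = BeautifulSoup(sample_html, "html.parser")
print(extract_details(painting_table))
# expected: {'Artist': 'Claude Monet', 'Medium': 'Oil on canvas'}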
Example #3
def drugMentions():
    dir = os.path.dirname(__file__)
    finalList = []
    textList = []

    input = open(os.path.join(dir, "output/output.txt"), "r")
    htmlParse = input.read().decode("utf-8")

    soup = BeautifulSoup(htmlParse)

    tables = soup.findChildren(["table"])
    tableIDs = [(n["value"]) for n in soup.findChildren("input")]
    del tableIDs[0]

    for c in range (0, len(tables)):
        dictList = tables[c].findChildren(["td"])

        for t in range (0, len(dictList)):
            textList.append(re.sub(' +', ' ', (dictList[t].getText().strip("\t\n\r").replace("\n", "").strip().upper().encode("utf-8"))))

        finalList.append([tableIDs[c].encode("utf-8"), textList])
        textList = []

    file = open(os.path.join(dir, "output/finalList.txt"), "w")
    file.write(str(finalList))

    for c in range (0, len(finalList)):
        file = open(os.path.join(dir, finalList[c][0].replace("output/TABLE-", "")), "w")
        file.write(str(finalList[c][1]))
Example #5
    def _crawl_level_2(self, link):
        """

        :rtype: dict
        :type link: str
        """
        response = requests.post(link)
        soup = BeautifulSoup(response.content, "html.parser")
        soup.findChildren()
        comic_name = soup.find(id="breadcrumbs")
        main_section = soup.find(id="wrapper") \
            .find("section", class_="main-content") \
            .div \
            .find("div", class_="col-md-8") \
            .section
        thumbnail = main_section.find("div", class_="thumbnail") \
            .img
        description = main_section.find("div", class_="detail") \
            .find("div", class_="content")
        generic_information = main_section.find("div", class_="description")
        chapters = soup.find(id="list-chapters")

        information = self._get_generic_information(generic_information)
        result = self._get_chapters(chapters)
        chapters_link_list = result.get("chapters_link")
        chapters_name_list = result.get("chapters_name")

        chapter_page_list = list()
        for chapter_link in chapters_link_list:
            chapter_page_list.append(
                CrawlLevel1().crawl("https://blogtruyen.com" + chapter_link))
            pass

        # print(information)
        # print(chapters_link_list)
        # print(chapters_name_list)
        # print(self.get_comic_name(comic_name))
        # print(self.get_thumbnail(thumbnail))
        # print(self.get_description(description))
        # print(chapter_page_list)
        # print(chapters_dict)
        # print("~~~~~~~~~~~~~~~~~~~~~~~~~~")

        # comic_name: #breadcrumbs > span:nth-child(2)
        # thumbnail: #wrapper > section.main-content > div > div.col-md-8 > section > div.thumbnail > img
        # description: #wrapper > section.main-content > div > div.col-md-8 > section > div.detail > div.content
        # generic_information: #wrapper > section.main-content > div > div.col-md-8 > section > div.description
        # chapters: #list-chapters

        return dict(comicname=self._get_comic_name(comic_name),
                    thumbnail=self._get_thumbnail(thumbnail),
                    description=self._get_description(description),
                    authors=information.get("tacgia"),
                    translators=information.get("nhomdich"),
                    genres=information.get("theloai"),
                    finishstatus=information.get("trangthai")[0],
                    chaptersname=chapters_name_list,
                    chapterspage=chapter_page_list)
        pass
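A hedged alternative sketch (mine, not part of the crawler): the chained .find() calls above correspond to the CSS selectors already listed in the comments, so the same elements could be fetched with select_one(). The link variable stands for the same page URL passed to _crawl_level_2.

import requests
from bs4 import BeautifulSoup

response = requests.post(link)
soup = BeautifulSoup(response.content, "html.parser")
comic_name = soup.select_one("#breadcrumbs")
thumbnail = soup.select_one("#wrapper > section.main-content > div > div.col-md-8 > section > div.thumbnail > img")
description = soup.select_one("#wrapper > section.main-content > div > div.col-md-8 > section > div.detail > div.content")
generic_information = soup.select_one("#wrapper > section.main-content > div > div.col-md-8 > section > div.description")
chapters = soup.select_one("#list-chapters")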
Example #6
def parseHTMLtoJSON(htmlText):
    soup = BeautifulSoup(htmlText, "html.parser")
    global caseData
    global headerData
    caseData = []
    headerData = {}
    allTables = soup.findChildren("table")
    storeHeader(allTables[0])
    bodyTables = soup.findChildren("table", {"class": "style3"})
    storeBody(bodyTables)
Example #7
def scrape_stats():
	#urls run from 0 to 402
	url_count = 0
	#src_url = 'http://crimereporting.ncdoj.gov/public/2013/LEPersonnel/LEPerPopRatAgyTrd/leperpopratagytrd/'
	src_url = 'http://crimereporting.ncsbi.gov/public/2014/LEOKillAsslt/LEOAssltWeaAgyTrd/leoassltweaagytrd/'
	src_url_end = '.htm'

	fieldnames = ['agency_id','agency_name','Year', 'Firearm', 'Knife or Other Cutting Instrument','Other Dangerous Weapon','Hands, Fists, Feet, etc.',' Total Officer Assaults']
	writer = csv.DictWriter(open('lea_assaults.csv', 'wb'),fieldnames=fieldnames)
	lea_row = {'agency_id':'','agency_name':'','Year':'', 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0}
	writer.writeheader()
	print 'ALERT: New log created...'

	while url_count < 402:
		expect_year = 2005
		print src_url + str(url_count) + src_url_end
		html_file = urllib2.urlopen(src_url + str(url_count) + src_url_end).read()
		soup = BeautifulSoup(html_file, 'html.parser')

		for lea_detail in soup.findChildren('table')[11].findChildren('td'):
			lea_name = lea_detail.string

		data_table = soup.findChildren('table')[12]

		rows = data_table.findChildren('tr')
		header = 1
		header_row = []
		for row in rows:
			cell_count = 0
			lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':'', 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0}
			cells = row.findChildren('td')
			current_year = cells[0].string
			if header == 1:
				for cell in cells:
					header_row.append(cell.string)
				header = 0
			else:
				while int(current_year) != expect_year:
					lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':expect_year, 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0}
					writer.writerow(lea_row)
					expect_year += 1
				for cell in cells:
					try:
						value = int(cell.string)
						lea_row[header_row[cell_count]] = value
					except ValueError:
						pass
					cell_count += 1
				writer.writerow(lea_row)
				expect_year += 1
		while int(expect_year) <= 2014:
			lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':expect_year, 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0}
			writer.writerow(lea_row)
			expect_year += 1
		url_count += 1
Example #8
    def parse_response(self, response):
        """
        Parses Apache server-status response.
        """
        VHOST_List = []
        REQUEST_URI_List = []
        FULL_URL_List = []
        CLIENT_IP_ADDRESS_List = []

        # URL-related.
        soup = BeautifulSoup(response, 'lxml')
        try:
            table_index_id = 0
            VHOST_index_id = -2
            REQUEST_URI_index_id = -1
            CLIENT_IP_ADDRESS_index_id = -3

            for _ in range(len(soup.findChildren('table')[table_index_id].findChildren('tr'))):
                if _ != 0:
                    try:
                        VHOST = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[VHOST_index_id].getText()
                    except Exception as e:
                        Exception_Handler(e)
                        VHOST = ''
                    try:
                        REQUEST_URI = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[REQUEST_URI_index_id].getText().split(' ')[1]
                    except Exception as e:
                        Exception_Handler(e)
                        REQUEST_URI = ''
                    try:
                        if (VHOST == REQUEST_URI == ''):
                            FULL_URL = ''
                        else:
                            FULL_URL = 'http://' + str(VHOST) + str(REQUEST_URI)
                    except Exception as e:
                        Exception_Handler(e)
                        FULL_URL = ''

                    VHOST_List.append(VHOST)
                    REQUEST_URI_List.append(REQUEST_URI)
                    FULL_URL_List.append(FULL_URL)

                    # Client-related.
                    try:
                        CLIENT_IP_ADDRESS = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[CLIENT_IP_ADDRESS_index_id].getText()
                    except:
                        CLIENT_IP_ADDRESS = ''

                    CLIENT_IP_ADDRESS_List.append(CLIENT_IP_ADDRESS)

        except Exception as e:
            Exception_Handler(e)
            pass
        output = {"VHOST": VHOST_List, "REQUEST_URI": REQUEST_URI_List, "FULL_URL": FULL_URL_List, "CLIENT_IP_ADDRESS": CLIENT_IP_ADDRESS_List}
        return(output)
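A rough simplification sketch (mine, not the original parser): the same fields can be pulled by looking up the table and its rows once instead of re-running findChildren() on every iteration. The column positions (-2, -1, -3) are taken from the code above; per-cell error handling is omitted.

from bs4 import BeautifulSoup

soup = BeautifulSoup(response, 'lxml')  # response: the server-status HTML passed to parse_response()
rows = soup.findChildren('table')[0].findChildren('tr')[1:]  # skip the header row
for row in rows:
    cells = row.findChildren('td')
    vhost = cells[-2].getText()
    request_field = cells[-1].getText()
    request_uri = request_field.split(' ')[1] if ' ' in request_field else ''
    client_ip = cells[-3].getText()
    full_url = '' if (vhost == request_uri == '') else 'http://' + vhost + request_uri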
Example #9
    def scrape_page(self, page_request: Future) -> Generator[Vod, None, None]:
        page_content = page_request.result().content
        page_strainer = SoupStrainer("table")
        page_soup = BeautifulSoup(page_content,
                                  "lxml",
                                  parse_only=page_strainer)

        vod_requests = [
            self.request(tr.findChild("a")["href"])
            for tr in page_soup.findChildren("tr")
        ]

        for table in page_soup.findChildren(recursive=False):
            date = table.caption.span.getText()
            for i, row in enumerate(table.tbody.findChildren(recursive=False)):
                cells = row.findChildren(recursive=False)

                try:
                    vod_id = re.search(r".*\/(.*)",
                                       cells[1].a["href"]).group(1)

                    try:
                        best_of = int(
                            re.search(r"Bo([\d]*)",
                                      cells[3].getText()).group(1))
                    except AttributeError:
                        continue

                    players = []
                    player = Vod.Player("Unknown", [])
                    for tag in cells[1].a.span.findChildren(recursive=False):
                        if tag.name == "b":
                            if len(player.characters) != 0:
                                players.append(player)
                                player = Vod.Player("Unknown", [])
                            player.alias = tag.getText()
                        elif tag.name == "img":
                            player.characters.append(
                                guess_character(tag["src"][24:-4]))
                    players.append(player)

                    video_ids, casters = self.scrape_vod_page(
                        vod_id, vod_requests[i])

                    tournament = re.search(r"[^\s].*[^\s]",
                                           cells[0].getText()).group()
                    _round = re.search(r"[^\s].*[^\s]",
                                       cells[4].getText()).group()

                    yield Vod(vod_id, video_ids, date, tournament, players,
                              casters, _round, best_of)
                except InvalidVideoError as e:
                    if self.verbose:
                        print(e, file=sys.stderr)
Example #10
    def import_from_file(path_to_html):
        if not os.path.isfile(path_to_html):
            sys.stdout.flush()
            err = f"Could not open doc file '{path_to_html}': No such file or directory."
            raise FileNotFoundError(err)

        with open(path_to_html, "r") as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, "html5lib")

        headers = soup.findChildren("h1")
        tables = soup.findChildren("table")
        assert len(headers) == len(tables)

        reqs_array_array = []
        for reqs_table in soup.findChildren("table"):
            reqs = ConfluenceHTMLTableImport.parse_table(reqs_table)
            reqs_array_array.append(reqs)

        document = Document(None, "Imported Doc", None, [], [])
        for section_idx, reqs in enumerate(reqs_array_array):
            section_name = headers[section_idx].text
            section = Section(document, 1, section_name, [], [])
            document.section_contents.append(section)
            for req in reqs:
                uid = req["UID"]
                title = req["TITLE"]
                statement = req["STATEMENT"]
                rationale = req["RATIONALE"]
                comment = req["COMMENT"]
                sreq = Requirement(
                    section,
                    None,
                    statement,
                    uid,
                    None,
                    None,
                    None,
                    title,
                    None,
                    None,
                    rationale,
                    [RequirementComment(None, None, comment)]
                    if comment
                    else [],
                    None,
                )
                sreq.ng_level = 2
                section.section_contents.append(sreq)

        return document
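A hedged usage sketch (the file name is a placeholder; it assumes the method is exposed on ConfluenceHTMLTableImport, as the call to ConfluenceHTMLTableImport.parse_table in the body suggests):

document = ConfluenceHTMLTableImport.import_from_file("exported_confluence_page.html")
print(len(document.section_contents), "sections imported")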
Example #11
def execution(list1, id1):

    for domain_name in list1:

        try:
            driver = webdriver.Firefox()
            driver.get("http://www.dmoz.com")
            name = ''
            if 'http://' in domain_name:
                name = domain_name[7:]
            else:
                name = domain_name
            elem = driver.find_element_by_name("q")
            elem.send_keys(domain_name)
            elem.send_keys(Keys.RETURN)
            soup = BeautifulSoup(driver.page_source)
            c = soup.findChildren("ol", {"class": "dir"})
            str1 = ""
            for item in c:
                for link in item.find_all('a'):
                    str1 = str1 + link.get('href') + '\n'
            d = soup.findChildren("ol", {"class": "site"})
            flag = 0
            for i in d:
                for link in i.find_all('a'):
                    x = link.get('href')
                    if domain_name in x or x in domain_name:
                        flag = 1
                        path1 = 'D://Thesis//data//domain_name//sources_in_dmoz//'
                        if len(str1) > 0:
                            f = open(path1 + name + '.txt', 'w')
                            f.write(str1)
                            f.close()
                            print 'completed', domain_name, id1
                            break
                if flag == 1:
                    break
            if flag == 0:
                path2 = 'D://Thesis//data//domain_name//source_not_in _dmoz//unavailable_sources.txt'
                f1 = open(path2, 'a+')
                f1.write(domain_name + '\n')
                f1.close()
                print 'not found in dmoz', domain_name, id1
            #print 'completed ',domain_name,id1
            driver.close()
        except:
            path3 = 'D://Thesis//data//domain_name//source_not_in _dmoz//invalid1.txt'
            f2 = open(path3, 'a+')
            f2.write(domain_name + '\n')
            f2.close()
            print 'url not valid', domain_name, id1
Example #12
File: xls.py Project: zurez/scrap
def write(f):
	data=f.read()
	soup=BeautifulSoup(data,"lxml")
	worksheet= workbook.add_worksheet()
	title= soup.find("h3",{"class":"sectionTitle"}).contents[0]
	if title!=None:

		worksheet.write(0,0,title)
	table= soup.findChildren("table")
	rows = soup.findChildren(["th","td"])
	b=0
	

	try:
		i=0
		while i<len(rows):


		# for i in xrange(len(rows)):
			# print rows[i].get("class")
			
			if rows[i].get("class")[0]=="groupHead":
				# print "Heading: " + rows[i].text
				worksheet.write(b+1,0,rows[i].text)
				b+=1
				i+=1
				
				
				# rows.remove(rows[i])
			elif rows[i].get("class")[0]=="specsKey":
				worksheet.write(b+1,0,rows[i].text)
				worksheet.write(b+1,1,rows[i+1].text)
				# print rows[i].text + ":" + rows[i+1].text
				# rows.remove(rows[i])

				# rows.remove(rows[i+1])
				b+=1
				i+=2
			elif rows[i].get("class")[0]=="specsValue":
				# print "Value:" + rows[i].text
				worksheet.write(b+1,0,rows[i].text)
				b+=1
				i+=1
			else:
				print "Un" + rows[i].text

	except Exception as e:
		print "Filename: " + str(i)
		print e
		workbook.close()
Example #13
def scrape_stats():
	#urls run from 0 to 605
	url_count = 0
	src_url = 'http://crimereporting.ncsbi.gov/public/2014/LEPersonnel/LEPerPopRatAgyTrd/leperpopratagytrd/'
	src_url_end = '.htm'

	fieldnames = ['agency_id','Agency Name','Year','Reporting Status','Fulltime Male Sworn','Fulltime Female Sworn','Fulltime Male Civilian','Fulltime Female Civilian','Total Employees','Population Coverage','Sworn Rate per 1,000 Population']
	writer = csv.DictWriter(open('lea_personnel.csv', 'wb'),fieldnames=fieldnames)
	lea_row = {'agency_id':'','Agency Name':'','Year':'','Reporting Status':'','Fulltime Male Sworn':0,'Fulltime Female Sworn':0,'Fulltime Male Civilian':0,'Fulltime Female Civilian':0,'Total Employees':0,'Population Coverage':0,'Sworn Rate per 1,000 Population':0}
	writer.writeheader()
	print 'ALERT: New log created...'

	while url_count < 605:
		expect_year = 2005
		print src_url + str(url_count) + src_url_end
		html_file = urllib2.urlopen(src_url + str(url_count) + src_url_end).read()
		soup = BeautifulSoup(html_file, 'html.parser')

		for lea_detail in soup.findChildren('table')[11].findChildren('td'):
			lea_name = lea_detail.string

		data_table = soup.findChildren('table')[12]

		rows = data_table.findChildren('tr')
		header = 1
		header_row = []
		for row in rows:
			cell_count = 0
			cells = row.findChildren('td')
			if header == 1:
				for cell in cells:
					header_row.append(cell.string)
				header = 0
			else:
				while int(cells[1].string) != expect_year:
					lea_row = {'agency_id':url_count,'Agency Name':cells[0].string,'Year':expect_year,'Reporting Status':'','Fulltime Male Sworn':'','Fulltime Female Sworn':'','Fulltime Male Civilian':'','Fulltime Female Civilian':'','Total Employees':'','Population Coverage':'','Sworn Rate per 1,000 Population':''}
					writer.writerow(lea_row)
					expect_year += 1
				lea_row = {'agency_id':url_count,'Agency Name':cells[0].string,'Year':'','Reporting Status':'','Fulltime Male Sworn':0,'Fulltime Female Sworn':0,'Fulltime Male Civilian':0,'Fulltime Female Civilian':0,'Total Employees':0,'Population Coverage':0,'Sworn Rate per 1,000 Population':0}
				for cell in cells:
					try:
						value = int(cell.string.replace(',',''))
						lea_row[header_row[cell_count].lstrip()] = value
					except ValueError:
						if cell.string == 'Does Not Participate' or cell.string == 'Reporting':
							lea_row[header_row[cell_count].lstrip()] = cell.string
					cell_count += 1
				writer.writerow(lea_row)
				expect_year += 1
		url_count += 1
Example #14
def extract_code(url, file_name):
    print 'file : ', url
    global file_count
    # if(file_count>=50):
    # 	sys.exit
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, 'xml')
    file = open('codes/' + file_name, 'w')
    file_count += 1
    # repcontent=(soup.find(class_="repository-content")).find_all('table')
    # for row in soup.find(class_="repository-content").find_all("tr"):
    # 	print(row,'\n')
    # date=soup.find_all(datetime=True)
    try:
        urllib2.urlopen(url)
        driver.get(url)
        sleep(6)
        date = soup.findChildren(['relative-time'])
        # print("Date is ",date)
        if (date != []):
            datetime = date[0]['datetime']
            # print "Date : ",datetime
        else:
            datetime = 'Date Not Available'
        tables = soup.findChildren('table')
        # print(tables)
        my_table = tables[0]
        rows = my_table.findChildren(['th', 'tr'])

        line = 0
        for row in rows:
            # print(line)
            line += 1
            for string in row.stripped_strings:
                string = (unicode(string)).encode('utf-8')
                # print(string)
                file.write(string)
                file.write(' ')
                # spans=cells[1].findChildren('span')
                # for span in spans:
                # print(span.string)
            file.write('\n')
        label_file.write('codes/' + str(file_count - 1) + '.txt ' + url +
                         '\t' + (unicode(datetime)).encode('utf-8') + '\n')
        file.close()
        return datetime
    except Exception as inst:
        print("Error : ", inst)
        file_count -= 1
        return -1
Example #15
def get_points_bw(self, m, p, s, rs):
    try:
        # BROWSE
        logging.info('Loading Browser')
        b = new_browser()
        b.open(s.page)

        b.form = list(b.forms())[0]
        b[s.form_user] = m.username
        b[s.form_pass] = p #m.password
        b.submit()
        
        html = b.open('/mt/www.bestwestern.com/rewards/').read()
        b.close()

        # TRAVERSE
        logging.info('Traversing')
        soup = BeautifulSoup(html).find(id=s.match)

        rs['content'] = soup.findChildren()[1]
        rs['points'] = rs['content'].div('div')[1].contents[1][2:]
        rs['success'] = True

    except Exception, e:
        logging.info(e)
        rs['success'] = False
Example #16
def get_blurb(first, last, sport, player_url=None):
    # for some weird reason it's actually better to omit the first name in the search form
    response = get(player_url if player_url else blurb_search_url.format(first="", last=last, sport=sport))
    soup = BeautifulSoup(response.text, 'html.parser')
    # did we land a result page?
    if not soup.findChild('div', class_='RW_pn'):
        name_map = {}
        results_table = soup.find('table', attrs={'id':'cp1_tblSearchResults'})
        # filter results, omitting duplicate "position" links that don't include the player's name
        filtered_results = results_table.findChildren(lambda tag: tag.name == 'a' and 'player' in tag['href'] and len(tag.text) > 3)
        if not filtered_results:
            raise NoResultsError("No results for %s %s" % (first, last))
        else:
            for result in filtered_results:
                name = " ".join(result.text.split())
                name_map[result] = SequenceMatcher(None, first + " " + last, name).ratio()
        # sort names by similarity to search criteria
        sorted_names = sorted(name_map, key=name_map.get, reverse=True)
        return get_blurb(first, last, sport, player_url='http://www.rotoworld.com' + sorted_names[0].get('href'))
    else:
        news = soup.findChildren('div', class_='playernews')
        if news:
            recent_news = news[0]
            report = recent_news.find('div', class_='report')
            impact = recent_news.find('div', class_='impact')
            blurb = report.text + '\n\n' + impact.text
            return blurb
        else:
            raise NoResultsError("No recent player news for %s %s" % (first, last))
Example #17
def parse_day(canteen, url):
    content = urlopen(url).read()
    data = BeautifulSoup(content.decode('utf-8'), 'xml')

    for group in data.findChildren('group'):
        date = group['productiondate']
        category = group.findChild('name').getText()
        prices = parse_prices(group.findChild('prices').findChildren('price'))

        components = group.findChild('components').findChildren('component')
        components = [ c.findChild("name1").getText() for c in components ]

        tags = group.findChild('taggings').findChildren('tagging')
        tags = [ t.getText() for t in tags if not t.is_empty_element ]

        if '1' == group['type']:
            # meal consisting of multiple parts, use first component as name

            if len(components) < 1:
                print("meal without component: {}".format(group))
                continue

            notes = components[1:] + tags
            canteen.addMeal(date, category, components[0], notes, prices)
        elif '2' == group['type']:
            # multiple components to choose from

            for component in components:
                canteen.addMeal(date, category, component, tags, prices)
        else:
            print('unknown meal type: {}'.format(group['type']))
Example #18
def query_profile(url, first_pass=False):
    html = urlopen(url).read()
    soup = BeautifulSoup(html)
    if html.find("Sorry, no content found for this URL") > 0:
        return 404, "", "", "", "", ""
    else:
        tables = soup.findChildren("table")
        pub_table = tables[1]
        stats_table = tables[0]

        if html.find("There are no articles in this profile.") > 0:
            return None

        elif first_pass == True:
            scholar = soup.find("div", {"id": "gsc_prf_i"})
            name = str(scholar.find("div", {"id": "gsc_prf_in"}).get_text())
            # try:
            institution = str(scholar.find("div", {"class": "gsc_prf_il"}).get_text())
            # except:
            # 	institution = str(scholar.find('div', {'class':'gsc_prf_il'}).get_text())
            # # interests = ",".join([i.get_text() for i in scholar.find_all(id='gsc_prf_ila')])
            # interests=[]
            interests = scholar.find_all("div", {"class": "gsc_prf_il"})[1]
            # # for a in interests:
            # # 	interests.append(a.get_text())
            # try:
            # 	interests=interests.get_text()
            # except:
            interests = ", ".join([i.get_text() for i in interests.find_all("a", {"class": "gsc_prf_ila"})])
            email = str(scholar.find_all("div", {"class": "gsc_prf_il"})[2].get_text())
            return name, institution, parse_pubs(pub_table), parse_stats(stats_table), interests, email

        else:
            return parse_pubs(pub_table)
Example #19
    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://cybercrime-tracker.net/ccam.php'
        source = 'cybercrime-tracker.net'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        soup = BeautifulSoup(_info, 'lxml')
        table = soup.findChildren('tbody')[2]
        rows = table.findChildren('tr', attrs={'class': 'monitoring'})
        for row in rows:
            date_str = row.findChildren('td')[1].string
            time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
            updatetime = time.strftime("%Y-%m-%d", time_obj)
            domain = row.findChildren('td')[2].string
            hashstr = row.findChildren('td')[3].string
            if self.is_ip(domain): continue
            block = [domain, updatetime, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"  # "秒" means "seconds"
        self.save_info(domains, source, crawl_time)
Example #20
def getNews():
    page = urllib2.urlopen("https://www.inshorts.com/en/read")
    soup = BeautifulSoup(page, 'lxml')
    news_image = soup.findChildren('div', {"class": "news-card-image"})
    news_title = soup.findAll('div',
                              {'class': 'news-card-title news-right-box'})
    news_read_more = soup.findAll("div", {"class": "read-more"})
    news_body = soup.findAll("div", {"itemprop": "articleBody"})
    news_readmore = []
    news_img = []
    news_tit = []
    news_con = []
    for x in range(len(news_image)):
        news_img.append(re.findall("url\((.*)\)", news_image[x]['style'])[0])
        news_tit.append(re.split("\\n\n", news_title[x].text)[1])
        news_con.append(news_body[x].text)
        news_readmore.append(news_read_more[x].findChildren())

    news_readmore = news_readmore[0:3]
    news_img = news_img[0:3]
    news_tit = news_tit[0:3]
    news_con = news_con[0:3]
    newshtml = ""
    for i in range(3):
        newshtml += """<div style="background-color:#FFFFFF;color:#000000;padding:15px">
                        <img src = %s, align = "left", width = "171", height = "128", style = "padding:15px"></img></div>
                        <div style = "display:inline"><h4>%s</h4>
                        <p style = "font-size:12px;padding-left:15px">%s</p>
                        <p style = "font-size:8px;color:#A9A9A9;padding-left:15px">read more at%s</p></div><br>""" % (
            news_img[i], news_tit[i], news_con[i], news_readmore[i])

    return (newshtml)
Example #21
def main():
    url = 'http://www.ieee.org/conferences_events/conferences/search/index.html?KEYWORDS=&CONF_SRCH_RDO=conf_date&RANGE_FROM_DATE=&RANGE_TO_DATE=&REGION=Region10-Asia+and+Pacific&COUNTRY=Bangladesh&RowsPerPage=10&PageLinkNum=10&ActivePage=1&SORTORDER=desc&SORTFIELD=start_date'
    content = urlopen(url)
    soup = BeautifulSoup(content, 'lxml')
    conference_table = soup.findChildren('table', class_='nogrid-nopad')
    rows = conference_table[0].findChildren('td', class_='pad10')

    events = []

    for row in rows:
        event = row.find_all('p')
        for info in event:
            events.append(get_text(str(info)))

    label = [
        "Event title: ", "Date of Submissions:", "Event Date:",
        "Event Location:"
    ]

    extra_decoration = 0

    print("*" * 60, "\n")

    for lab, event in zip(label * len(events), events):
        print(lab, event, end="\n")
        extra_decoration += 1

        if extra_decoration == 4:
            print("\n", "*" * 60, "\n")
            extra_decoration = 0
Example #22
def scrape_leaderboard(url):
    request = urllib2.Request(url)
    page = urllib2.urlopen(request)
    content = page.read()
    soup = BeautifulSoup(content)
    soupTable = soup.findChildren('table',{'class':"leaderboard-table"})
    headers = soupTable[0].findChildren('th')
    rows = soupTable[0].findChildren('tr')
    rows = filter(lambda r: len(r) > 5, rows)
    tableheaders = [i.text.replace(' ','') for i in headers]
    cutplace = 1000
    cutcheck = True

    row_details = []
    for i, row in enumerate(rows):
        tt = [j.find("a", {"class":"full-name"}).text if j.get("class")[0] == "playerName" else j.text for j in row.findAll('td')]
        if len(tt) > 5:
            if cutcheck and tt[tableheaders.index("POS")] == "-":
                cutplace = i + 5
                cutcheck = False
                tt[tableheaders.index("POS")] = cutplace
            elif tt[tableheaders.index("POS")] == "-":
                tt[tableheaders.index("POS")] = cutplace
            if "T" in str(tt[tableheaders.index("POS")]):
                tt[tableheaders.index("POS")] = tt[tableheaders.index("POS")][1:]
            try:
                tt[tableheaders.index("POS")] = int(tt[tableheaders.index("POS")])
            except ValueError:
                tt[tableheaders.index("POS")] = cutplace
            try:
                row_details.append(leaderboard(*tt))
            except:
                continue
    return pd.DataFrame(row_details)
Example #23
    def parseNoteFirst(self, text=None, infile=None):
        """Parse NoteFirst record (xml format), return self"""
        if isinstance(text, basestring):
            pass
        elif isinstance(infile, basestring):
            f = open(infile)
            text = f.read()
            f.close()
        elif isinstance(infile, file):
            text = infile.read()
        else:  # Do nothing
            return None
        soup = BeautifulSoup(text, "html.parser")
        self.title = soup.primarytitle.text
        doi = soup.doi.text
        self.doi = doi[doi.find("10.") :]
        self.journal = soup.media.info.text
        self.year = soup.year.text
        self.volume = soup.volume.text
        self.issue = soup.issue.text
        self.pages = soup.pagescope.text
        authors = soup.findChildren("fullname")
        self.authors = [author.info.text for author in authors]
        # self.issn=""
        return self
Example #24
def table_parse(page) :
	data = []
	soup = BeautifulSoup(page)
	table = soup.findChildren('table')[0]
	heads = table.findChildren(['th'])
	rows = table.findChildren(['tr'])
	rows.pop(0) # Pop first row when it is the head
	for row in rows :
		tmp = [] # organized data record
		cells = row.findChildren('td')
		# cells.pop(0) # Pop first cell when there is special flag/symbol/space
		for cell in cells :
			if cell.string is not None :
				value = cell.string
				value = value.replace('\\n\\t', '')
				value = value.replace('\\n', '')
				value = value.strip()
				tmp.append(value)
			else :
				value = cell.findChildren('a')[0].string
				value = value.replace('\\n\\t', '')
				value = value.replace('\\n', '')
				value = value.strip()
				tmp.append(value)
				url = cell.find('a').get('href')
				tmp.append(url)

		if tmp[3] == 'United States' :
			country = 'USA'
		else :
			country = tmp[3]
		data.append([tmp[0], tmp[1]] + location_parse(tmp[4]) + [country, tmp[2]])
	return data
Example #25
def get_categories(web_data):
    """Put items into readable nested dict, and return for building the menu"""

    soup = BeautifulSoup(web_data, 'html.parser')

    #xbmc.log("soup: {0}.".format(soup),level=xbmc.LOGERROR)

    ##Grab only the section list items:
    children = soup.findChildren("li")

    my_dict = {}

    for child in children:
        try:
            if child['class'][0] == "_depth0":

                key0 = child.text.lstrip(" ").encode('utf-8')

                my_href = child.find("a")["href"]
                if len(my_href) > 0:
                    my_dict[key0] = my_href
                else:
                    my_dict[key0] = {}

            elif child['class'][0] == "_depth1":

                key1 = child.text.lstrip(" ").encode('utf-8')

                my_href = child.find("a")["href"]
                if len(my_href) > 0:
                    my_dict[key0][key1] = my_href
                else:
                    my_dict[key0][key1] = {}

            elif child['class'][0] == "_depth2":
                key2 = child.text.lstrip(" ").encode('utf-8')

                my_href = child.find("a")["href"]
                if len(my_href) > 0:

                    my_dict[key0][key1][key2] = my_href
                else:
                    my_dict[key0][key1][key2] = {}

            elif child['class'][0] == "_depth3":
                key3 = child.text.lstrip(" ").encode('utf-8')

                my_href = child.find("a")["href"]
                if len(my_href) > 0:

                    my_dict[key0][key1][key2][key3] = my_href
            else:
                pass

        except:
            pass

    #xbmc.log("My_dict: {0}".format(my_dict),level=xbmc.LOGERROR)

    return my_dict
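A tiny hedged illustration (the HTML is invented, not from the add-on) of the nested dict the function builds from the _depthN list items; under the Python 2 runtime this add-on targets, the keys come back as UTF-8 encoded strings.

web_data = '''
<ul>
  <li class="_depth0"><a href="/movies">Movies</a></li>
  <li class="_depth0"><a href="">TV</a></li>
  <li class="_depth1"><a href="/tv/drama">Drama</a></li>
</ul>
'''
print(get_categories(web_data))
# roughly: {'Movies': '/movies', 'TV': {'Drama': '/tv/drama'}}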
Example #26
    def get_images(self, url, headers):
        """ Collect all the urls, iterate over them, downloading every one.""" 
        res = requests.get(url=url, headers=headers)
        

        if res.status_code != requests.codes.ok:
            res.raise_for_status()  # raises requests.HTTPError for error responses

        content = res.content
        soup = BeautifulSoup(content)
        link_tags = soup.findChildren(attrs={'class':'item view album-view-image-link'})
        
        for elem in link_tags:
            url = elem.find('a').get('href')                    
            self.urls.append(url)   

        folder = 'Imgur_Album'

        # if not os.path.exists(folder):
        #   os.makedirs(folder)         # mkdir -p
        # The above way is bad, because a dir can be created between the 2 function calls, thus causing a race condition
        
        # better way:
        try:
            os.makedirs(folder)
        except OSError, e:
            if e.errno == errno.EEXIST:
                print "Directory Already exists."
                print "Download to existing directory?"
                input = raw_input("[y/n]")
                if not input.lower() == 'y':
                    print "Rename, or modify the directory name in the program."
                    sys.exit()
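A hedged side note (not part of the original downloader): on Python 3 the race condition described in the comments above is handled directly by the exist_ok flag, without the try/except dance.

import os

os.makedirs('Imgur_Album', exist_ok=True)  # no error if the directory already exists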
Example #27
    def generate_html(self, url, status_url, last_checked_time):
        with open("AppView.html") as inf:
            txt = inf.read()
            soup = BeautifulSoup(txt, "html.parser")
            #print(soup.prettify())
        new_tr = soup.new_tag('tr')
        new_td_url = soup.new_tag('td')
        new_td_url.append(soup.new_string(url))
        new_td_status_url = soup.new_tag('td')
        new_td_status_url.append(soup.new_string(status_url))
        new_td_last_checked_time = soup.new_tag('td')
        new_td_last_checked_time.append(soup.new_string(last_checked_time))
        # insert it into the document
        new_tr.append(new_td_url)
        new_tr.append(new_td_status_url)
        new_tr.append(new_td_last_checked_time)

        old_tr = soup.findChildren('tr')
        for tr in old_tr:
            old_td = tr.findChildren('td')
            url_string = old_td[0].getText()
            if url_string != '':
                if url_string == url:
                    soup.table.tr.replaceWith(new_tr)
                else:
                    soup.table.append(new_tr)

            else:
                soup.table.tr.replaceWith(new_tr)


        # save the file again
        with open("AppView.html", "w") as outf:
            outf.write(str(soup))
Example #28
def extract_workshops(url):
    """
    Extracts all information available for workshops provided at
    https://coling2020.org/pages/workshops
    :return: list of dictionaries with a workshop represented as one dictionary.
    """
    workshops = []
    # url = "https://coling2020.org/pages/workshops"

    try:
        page = request.urlopen(url)
    except:
        print("Could not connect to url.")

    soup = BeautifulSoup(page, 'html.parser').find("section",
                                                   {"id": "main_content"})

    for child in soup.findChildren('h3'):
        for i in child.findNext('ul').find_all('li'):
            workshop = {
                attribute: None
                for attribute in [
                    "workshop_name", "workshop_organizer",
                    "workshop_description", "workshop_day",
                    "workshop_location", "workshop_link"
                ]
            }
            workshop['workshop_day'] = child.text
            workshop['workshop_name'] = util.basic_string_clean(
                i.find('a').text)
            workshop['workshop_link'] = i.find('a')['href']
            workshops.append(copy.copy(workshop))

    # print(json.dumps(workshops, indent=1))
    return workshops
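A hedged usage sketch, using the URL given in the docstring and the commented-out line in the body:

workshops = extract_workshops("https://coling2020.org/pages/workshops")
for w in workshops:
    print(w["workshop_day"], "-", w["workshop_name"], w["workshop_link"])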
Example #29
def calculateLineTagRatio(line):
    """
    Calculation of the line's tag ratio.
    :param line: the line containing HTML data
    :return: the text-to-tag ratio is returned
    """

    soup = BeautifulSoup(line, "html.parser")

    tags = []
    non_tag_data = ""

    # Would loop through all the children of the HTML text
    for tag in soup.findChildren():

        # Would append the tag to the tags list
        tags.append(tag.name)

        # If the tag's only immediate descendant is text, add it to the non-tag data
        if len(tag.contents) == 1:
            # child is HTML text content
            if isinstance(tag.contents[0], basestring):
                non_tag_data += tag.contents[0]

    # Compute the number of tags seen
    tag_count = len(tags)

    # Computation of TTR for the line
    if tag_count == 0:
        return len(non_tag_data)
    else:
        return len(non_tag_data) / tag_count
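A hedged worked example (the input line is invented) of the text-to-tag ratio computed above:

line = "<p>Hello <b>world</b></p>"
# tags found: p, b  -> tag_count = 2
# <p> has two children, so it is skipped; <b>'s only child is the string "world"
# ratio = len("world") / tag_count = 5 / 2 (integer division under the Python 2
# runtime this snippet assumes, given its use of basestring)
print(calculateLineTagRatio(line))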
Example #30
def CymathFunction():
    sitedemo = "https://www.cymath.com/answer?q=sin(x)%3D24"
    site = formaturlexpr(uinput, "https://www.cymath.com/answer?q=", "cymath")

    browser.set_window_size(1120, 550)
    browser.get(site)
    browser.save_screenshot('cymathscreenshot.png')

    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    stepsdivlist = soup.find_all(id="steps_div")
    itnlist = soup.find_all(class_='itn')
    katexlist = soup.find_all(class_='katex')
    listmord = soup.find_all(class_='mord mathrm')
    sollist = soup.findChildren(class_='base textstyle uncramped')
    hiddenanswers = soup.find(id="answer")

    print("CYMATH")

    hiddenanswertext = hiddenanswers.get_text()
    # print(hiddenanswertext)
    hat1 = hiddenanswertext.replace('),sequence(', ' & ')
    hat2 = hat1.replace('sequence(', ' ')
    hat3 = hat2[:-1]
    hat4 = hat3.replace('PI', 'π')
    hiddenanswertext = hat4
    print(hiddenanswertext)
Example #31
    def getFacultyAdvisor(self, faculty_advisor):

        #opening faculty advisor details page
        self.br.open(
            "https://academics.vit.ac.in/student/faculty_advisor_view.asp")
        response = self.br.open(
            "https://academics.vit.ac.in/student/faculty_advisor_view.asp")

        #getting the soup
        soup = BeautifulSoup(response.get_data())

        #extracting tables
        tables = soup.findChildren('table')
        myTable = tables[1]
        rows = myTable.findChildren(['th', 'tr'])

        #extracting data
        for row in rows:

            #creating thread for each row
            thrd = myThread(row, 5, faculty_advisor)
            #starting the thread
            thrd.start()

            #appending into thread list
            threads.append(thrd)

        #waiting for each thread to complete
        for t in threads:
            t.join()

        #returning faculty_advisor
        return faculty_advisor
Example #32
def collect_content():
    final = []
    URL = 'https://www.mohfw.gov.in/'
    response = requests.get(URL).content
    soup = BeautifulSoup(response, "html.parser")
    table = soup.findChildren('table')
    global needed_table
    needed_table = table[0]
    all_rows = needed_table.find_all('tr')
    for row in all_rows:
        stats_row = []
        stats_row.append(row.find_all('td'))
        for spec_row in stats_row:
            ans = []
            for stats in spec_row:
                ans.append(stats.string)
            final.append(ans)
    final.pop(0)
    print(final)
    cur_data = {x[0]: {current_time: x[1:]} for x in final}
    past_data = load()

    if past_data != cur_data:
        mail.create_mail(needed_table, "",
                         main.name_email()[0],
                         main.name_email()[1])
        save(cur_data)

    else:
        print(f"No update at {current_time}")
Example #33
    def process_one(self, bundle, index):
        page = self.session.get(self.bundles[bundle]).text
        listpage = BeautifulSoup(page, 'html.parser')
        gamerow = listpage.findChildren('div', attrs={'class':
                                                      'game_row'})[index]
        imageurl = gamerow.findChild('div', attrs={
            'class': 'game_thumb'
        }).get('data-background_image')
        gamename = gamerow.findChild('h2', attrs={
            'class': 'game_title'
        }).getText()
        gamepage = gamerow.findChild('a').get('href')
        linux = False
        mac = False
        windows = False
        if gamerow.findChild('span', attrs={'class': 'icon icon-tux'}):
            linux = True
        if gamerow.findChild('span', attrs={'class': 'icon icon-apple'}):
            mac = True
        if gamerow.findChild('span', attrs={'class': 'icon icon-windows8'}):
            windows = True
        self.cache_game(gamename,
                        imageurl=imageurl,
                        downloadpage=gamepage,
                        linux=linux,
                        windows=windows,
                        mac=mac)
Example #35
    def parse_text_block(self, bs_textblock: BeautifulSoup):

        assert (bs_textblock.name == "p"
                and bs_textblock.attrs["blocktype"] == "Text"
                or bs_textblock.name == 'td')
        textblock = Abby(bs_textblock.attrs, AbbyType.P)

        els = bs_textblock.findChildren(recursive=False)
        current_word: Abby = Abby({}, AbbyType.WORD)
        current_line: Abby = Abby({}, AbbyType.LINE)
        for i in range(len(els)):
            el = els[i]
            if el.name == 'br':  # Line break
                current_line.add_child(current_word)
                textblock.add_child(current_line)
                current_word = Abby({}, AbbyType.WORD)
                current_line = Abby({}, AbbyType.LINE)
            elif el.text == " " or el.text == "\n":  # Space / Word end
                current_line.add_child(current_word)
                current_word = Abby({}, AbbyType.WORD)
            elif el.name == 'span':  # Character
                char_attributes = el.attrs
                char_attributes["text"] = el.text
                char = Abby(char_attributes, AbbyType.CHAR)
                current_word.add_child(char)
            else:
                raise ValueError("CANNOT PARSE UNKNOWN TYPE")

        return textblock
Example #36
	def getAttendance(self, attendance):

		#opening the attendance page
		self.br.open("https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s" % {"to_date" : today })
		response = self.br.open("https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s" % {"to_date" : today })
		soup = BeautifulSoup(response.get_data())

		#extracting tables
		tables = soup.findChildren('table')
		myTable = tables[3]
		rows = myTable.findChildren(['th','tr'])
		rows = rows[1:]
		i = 1


		#extracting data
		for row in rows:

			#creating thread for each row
			thrd = myThread(row, 2, attendance, i, self.br)
			#starting the thread
			thrd.start()

			#appending into thread list
			threads.append(thrd)

			i = i+1 
		
		#waiting for each thread to end
		for t in threads:
			t.join()

		return attendance
Example #37
def getNewData(thisURL):
    # reads XML file, converts to pandas dataFrame. Each row is one station.
    cabiBase = requests.get(thisURL)
    cabiSoup = BeautifulSoup(cabiBase.content,"lxml")
    CC = cabiSoup.findChildren()
    fnSoup = [x.name for x in CC]
    sta = cabiSoup.findAll('station')
    allContents = [x.contents for x in sta]
    fieldsHere = [[re.search('(?<=\<)\w+(?=>)',str(entry)).group(0) \
                for entry in x] for x in allContents]
    valuesHere = [[re.sub('&amp;','&',re.search('(?<=>)[^\<]*(?=\<)',str(entry)).group(0)) \
                             for entry in x] for x in allContents]              
    dNew = {}
    for ff in range(len(fieldsHere[0])):    # assumes they're all identical!
        thisField = fieldsHere[0][ff]
        thisType = getDtype(thisField)
        try:
            dNew.update({thisField:[thisType(x[ff]) for x in valuesHere]})
        except:
            temptemp = [x[ff] for x in valuesHere]
            temp2 = [thisType(x) if (len(x)) else -999 for x in temptemp]
            dNew.update({thisField:temp2})            
    overall_LastUpdate_sec = [int(CC[fnSoup.index('stations')].attrs['lastupdate'])/sec_2_msec]*(len(sta))
    zipIt = zip([1000000*OLU for OLU in overall_LastUpdate_sec],dNew['id'])
    DF = pd.DataFrame(dNew,index=[sum(zz) for zz in zipIt])
    return [DF,(cabiBase.content)]
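A hedged alternative sketch (mine, not the original): the same per-station records can be flattened without regular expressions by walking each <station> element's child tags. thisURL stands for the same feed URL passed to getNewData.

import pandas as pd
import requests
from bs4 import BeautifulSoup

resp = requests.get(thisURL)
soup = BeautifulSoup(resp.content, "lxml")
records = [
    {child.name: child.get_text() for child in station.findChildren()}
    for station in soup.find_all("station")
]
df = pd.DataFrame(records)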
Example #38
    def scrape(self, url):
        """
            Here's the general algorithm...
            - perform these actions for EACH <table> within <body>
            - hit each <tr class="evenColor">
        """
        events = []
        soup = BeautifulSoup(urllib2.urlopen(url).read())

        # this .findChildren is giving off way too many false positives. must be a better call here. [todo]
        tables = soup.findChildren("table")
        data_tables = []
        for table in tables:
            if table.findParent("table") is None:
                data_tables.append(table)
        print "count: ", len(data_tables)
        print "other count: ", len(tables)

        # parse out individual rows.
        for table in data_tables:
            rows = table.findAll("tr", {"class": "evenColor"})
            print "row count: ", len(rows)
            for row in rows:
                event = self.parseEventRow(row)
                print event
                events.append(event)
                #pdb.set_trace()

        return events
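A hedged sketch for the "[todo]" above (my suggestion, not the project's fix): find_all() accepts a callable, so the top-level tables can be selected in a single pass instead of post-filtering.

# inside scrape(), after soup has been built:
data_tables = soup.find_all(
    lambda tag: tag.name == "table" and tag.find_parent("table") is None)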
Example #40
def getStats(link, games):
    score_page = requests.get(BASEURL + link)
    score_soup = BeautifulSoup(score_page.text, 'html.parser')

    #Team names
    headings = score_soup.find_all(class_='section_anchor')
    away_team = headings[3]['data-label'].split('(')[0].rstrip()
    home_team = headings[5]['data-label'].split('(')[0].rstrip()

    tables = score_soup.findChildren('table')
    #Basic stats
    away_basic_table = tables[0]
    home_basic_table = tables[2]
    away_basic_stats = ripStatsFromTable(away_basic_table, False)
    home_basic_stats = ripStatsFromTable(home_basic_table, True)

    #'Advanced' stats
    away_adv_table = tables[1]
    home_adv_table = tables[3]
    away_adv_stats = ripStatsFromTable(away_adv_table, False)
    home_adv_stats = ripStatsFromTable(home_adv_table, True)

    #combine both sets of stats into one dictionary
    home_stats = {**home_adv_stats, **home_basic_stats}
    away_stats = {**away_adv_stats, **away_basic_stats}
    title = score_soup.title.getText().split('|')[0].rstrip().split(
        ' Box Score')
    title2 = title[0] + title[1]

    game = {title2: {home_team: home_stats, away_team: away_stats}}

    games.append(game)
Example #41
    def test_only_the_custom_region_is_created(self):
        caption_set = DFXPReader().read(
            SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)

        new_region = Layout(alignment=Alignment(HorizontalAlignmentEnum.LEFT,
                                                VerticalAlignmentEnum.TOP))

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
        # Using a different parser, because this preserves letter case
        # The output file is ok, but when parsing it, the "regular" parser
        # loses letter case.
        layout = BeautifulSoup(dfxp, features='xml').findChild('layout')

        self.assertEqual(len(layout.findChildren('region')), 1)

        region = layout.findChild('region')
        text_align = region['tts:textAlign']
        display_align = region['tts:displayAlign']

        internal_alignment = _create_internal_alignment(
            text_align, display_align)  # noqa
        self.assertEqual(internal_alignment.horizontal,
                         HorizontalAlignmentEnum.LEFT)  # noqa
        self.assertEqual(internal_alignment.vertical,
                         VerticalAlignmentEnum.TOP)  # noqa
Example #42
def results(reg_no = "", pswd = ""):

	#logging into student login
	br = login(reg_no,pswd)

	#checking that are we logged in or not
	if br.geturl() == ("https://academics.vit.ac.in/student/stud_home.asp") or br.geturl() == ("https://academics.vit.ac.in/student/home.asp"):
		print "SUCCESS"

		br.open("https://academics.vit.ac.in/student/grade.asp?sem=WS")
		response = br.open("https://academics.vit.ac.in/student/grade.asp?sem=WS")
		soup = BeautifulSoup(response.get_data())

		#extracting tables
		tables = soup.findChildren('table')

		try:
			myTable = tables[1]
		except IndexError:
			myTable = 'null'
			return {"status" : "Not_Updated"}

		rows = myTable.findChildren(['th','tr'])
		result = {}

		return {"status" : "Updated"}
	else:
		print "FAIL"
		return {"status" : "Failure"}
Ejemplo n.º 43
0
def getNewData(thisURL):
    # reads XML file, converts to pandas dataFrame. Each row is one station.
    cabiBase = requests.get(thisURL)
    cabiSoup = BeautifulSoup(cabiBase.content,"lxml")
    CC = cabiSoup.findChildren()
    fnSoup = [x.name for x in CC]
    sta = cabiSoup.findAll('station')
    allContents = [x.contents for x in sta]
    fieldsHere = [[re.search('(?<=\<)\w+(?=>)',str(entry)).group(0) \
                for entry in x] for x in allContents]
    valuesHere = [[re.sub('&amp;','&',re.search('(?<=>)[^\<]*(?=\<)',str(entry)).group(0)) \
                             for entry in x] for x in allContents]              
    dNew = {}
    for ff in range(len(fieldsHere[0])):    # assumes they're all identical!
        thisField = fieldsHere[0][ff]
        thisType = getDtype(thisField)
        try:
            dNew.update({thisField: [thisType(x[ff]) for x in valuesHere]})
        except ValueError:
            # Some stations report empty strings; fall back to a sentinel value
            temptemp = [x[ff] for x in valuesHere]
            temp2 = [thisType(x) if len(x) else -999 for x in temptemp]
            dNew.update({thisField: temp2})
    overall_LastUpdate_sec = [int(CC[fnSoup.index('stations')].attrs['lastupdate'])/sec_2_msec]*(len(sta))
    zipIt = zip([1000000*OLU for OLU in overall_LastUpdate_sec],dNew['id'])
    DF = pd.DataFrame(dNew,index=[sum(zz) for zz in zipIt])
    return [DF,(cabiBase.content)]
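getDtype and sec_2_msec are referenced above but defined elsewhere in the module; a hypothetical stand-in is sketched below (the field names are guesses based on the Capital Bikeshare station XML, not the original definitions):

# Hypothetical module-level helpers assumed by getNewData().
sec_2_msec = 1000.0   # milliseconds per second

def getDtype(field_name):
    # Guessed field lists; anything unrecognised is kept as a string.
    int_fields = {'id', 'nbBikes', 'nbEmptyDocks', 'lastCommWithServer'}
    float_fields = {'lat', 'long'}
    if field_name in int_fields:
        return int
    if field_name in float_fields:
        return float
    return str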
Ejemplo n.º 44
0
    def get_class_instructor(class_number, term):
        """
        :returns: a string that is the instructor for CLASS_NUMBER in term TERM

        :param: class_number: String, class number
        :param: term: String, term number
        """

        url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(
            class_number, term)
        page = urlopen(url)
        soup = BeautifulSoup(page.read(), 'html.parser')
        table = soup.findChildren('table')[0]
        rows = table.findChildren('tr')

        for row in rows:
            cells = row.findChildren('td')

            try:
                for index, cell in enumerate(cells):
                    if len(cell.contents) > 0 and str(
                            cell.contents[0]) == 'Description':
                        prev = cells[index - 1]
                        return prev.string.strip()
            except Exception:
                # Malformed row; skip it
                continue
Ejemplo n.º 45
0
def scrape_web(path: str, file: str) -> None:
    init_data = JsonFile(path, file)
    arguments = init_data.load()
    number_of_elements = len(arguments["search"])

    for index in range(number_of_elements):
        try:
            search_name = arguments["search"][index]
            web_request = requests.get(arguments["url"] + "/currencies/" +
                                       search_name)
            content = BeautifulSoup(web_request.content, 'lxml')
            table = content.findChildren('table')[0]
            rows = table.find_all('td')

            information = (search_name, ) + format_table(rows)
            current_coin = Coin(*information)

            if index == number_of_elements - 1:
                print(current_coin)
            else:
                print(current_coin, end="\n\n")

            file = current_coin.time + ".json"
            path = "out/" + current_coin.name
            coin_info = current_coin.__dict__
            result = JsonFile(path, file)
            result.save(coin_info)
        except Exception as error:
            print(f"Error: {error}")
Ejemplo n.º 46
0
    def get_course_numbers(subject, term, course_title):

        url = 'http://www.courses.as.pitt.edu/results-subja.asp?TERM={}&SUBJ={}'.format(
            term, subject)
        page = urlopen(url)
        soup = BeautifulSoup(page.read(), 'html.parser')
        table = soup.findChildren('table')[0]
        rows = table.findChildren('tr')

        course_numbers = []
        for row in rows:
            cells = row.findChildren('td')

            try:
                for index, cell in enumerate(cells):
                    if len(cell.contents) > 0 and str(
                            cell.contents[0]) == course_title:
                        prev = cells[index - 1]
                        course_numbers.append(prev.find('a').contents[0])
                        #print(prev.find('a').contents[0])
                        #print(cells)
            except Exception:
                # Malformed row; skip it
                continue

        return course_numbers
Ejemplo n.º 47
0
    def get_class_time(class_number, term):
        """
        :returns: a string that is the class time for CLASS_NUMBER in term TERM

        :param: class_number: String, class number
        :param: term: String, term number
        """

        url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(
            class_number, term)
        page = urlopen(url)
        soup = BeautifulSoup(page.read(), 'html.parser')
        table = soup.findChildren('table')[0]
        rows = table.findChildren('tr')

        has_time = False
        for row in rows:
            cells = row.findChildren('td')
            for cell in cells:
                if has_time:
                    if len(cell) > 1:
                        return (cell.contents[0].string.strip() + ' and ' +
                                cell.contents[2].string.strip())
                    else:
                        return cell.contents[0].string.strip()
                if str(cell.contents[0]) in ('AT', 'SE3', 'ST', '6W1', '6W2', '12W'):
                    has_time = True
Ejemplo n.º 48
0
    def test_only_the_custom_region_is_created(self):
        caption_set = DFXPReader().read(
            SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)

        new_region = Layout(
            alignment=Alignment(
                HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
            )
        )

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
        # Using a different parser, because this one preserves letter case.
        # The output file is ok, but when parsing it, the "regular" parser
        # loses letter case.
        layout = BeautifulSoup(dfxp, features='xml').findChild('layout')

        self.assertEqual(len(layout.findChildren('region')), 1)

        region = layout.findChild('region')
        text_align = region['tts:textAlign']
        display_align = region['tts:displayAlign']

        internal_alignment = _create_internal_alignment(text_align, display_align)  # noqa
        self.assertEqual(internal_alignment.horizontal, HorizontalAlignmentEnum.LEFT)  # noqa
        self.assertEqual(internal_alignment.vertical, VerticalAlignmentEnum.TOP)  # noqa
Ejemplo n.º 49
0
def details(br):

    details = []
    r = br.submit()
    dsoup = BeautifulSoup(r.get_data())
    dtables = dsoup.findChildren('table')

    try:
        dmyTable = dtables[2]
        drows = dmyTable.findChildren(['th', 'tr'])
        drows = drows[2:]

        for drow in drows:

            dcells = drow.findChildren('td')
            details.append({
                "date": dcells[1].getText(),
                "slot": dcells[2].getText(),
                "status": dcells[3].getText(),
                "class_units": dcells[4].getText(),
                "reason": dcells[5].getText()
            })

    except:
        # Fewer than three tables on the page: no attendance details to report
        print "No_table"

    br.open(
        "https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s"
        % {"to_date": today})

    return details
Ejemplo n.º 50
0
def parse_data(url):
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    tables = soup.findChildren('table')
    my_table = tables[0]

    return my_table.findChildren(['tr'])
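A short usage sketch for parse_data; the URL and the cell handling below are illustrative assumptions, not part of the original:

# Illustrative URL; any page with an HTML <table> works the same way.
rows = parse_data("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
for row in rows[:5]:
    cells = [c.getText(strip=True) for c in row.findChildren(['th', 'td'])]
    print(cells)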
Ejemplo n.º 51
0
def parseData(epicNo):
    global iterator, formParams, headerList, notFoundCount
    # print "Finding data for Epic No", epicNo
    response = requests.get("http://164.100.180.4/searchengine/SearchEngineEnglish.aspx")
    soup = BeautifulSoup(response.text, 'lxml')

    # Prepare the initial POST request to get the form that contains the EPIC No input field
    formParams = {}
    formParams = extractHiddenFields(formParams, soup)
    formParams = setDefaultFormFields(formParams)

    # Get the page to enter EPIC No
    selectedDistrictForm = requests.post('http://164.100.180.4/searchengine/SearchEngineEnglish.aspx', data = formParams)
    selectedDistrictSoup = BeautifulSoup(selectedDistrictForm.text, 'lxml')

    # Prepare for final request to get the required information
    finalFormParams = {}
    finalFormParams = extractHiddenFields(finalFormParams, selectedDistrictSoup)
    finalFormParams =  setDefaultFormFields(finalFormParams)
    finalFormParams['txtEPICNo'] = epicNo
    finalFormParams['RdlSearchBy'] = 0
    getVoterDetails = requests.post('http://164.100.180.4/searchengine/SearchEngineEnglish.aspx', data = finalFormParams)
    finalDetailsSoup = BeautifulSoup(getVoterDetails.text, 'lxml')

    # Details received now write it to a file
    with open('data.csv', 'a') as csvWriterFile:
        csvWriter = csv.writer(csvWriterFile)

        # Check if EPIC Number valid or not
        resultTables = finalDetailsSoup.findChildren('table', {'id': 'gvSearchResult'})
        if len(resultTables) > 0:
            dataTable = resultTables[0]
            dataRow = dataTable.findChildren(['tr'])[1]
            dataCell = dataRow.findChildren('td')

            dataList = []
            dataCell.pop(0)
            for cell in dataCell:
                # Coerce the NavigableString to a plain string before writing it out
                value = "" + cell.string
                dataList.append(value)
            # print dataList
            csvWriter.writerow(dataList)
        else:
            notFoundCount += 1
            notFound = ["Not Found"] * 10
            notFound.append(epicNo)
            csvWriter.writerow(notFound)
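extractHiddenFields and setDefaultFormFields are defined elsewhere in the script. A hedged sketch of how the first one might harvest the ASP.NET state fields with findChildren (the real helper may differ):

def extractHiddenFields(formParams, soup):
    # Copy every hidden <input> (e.g. __VIEWSTATE, __EVENTVALIDATION) into the
    # form dict so the next POST round-trips the ASP.NET page state.
    for hidden in soup.findChildren('input', {'type': 'hidden'}):
        name = hidden.get('name')
        if name:
            formParams[name] = hidden.get('value', '')
    return formParams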
Ejemplo n.º 52
0
    def test_only_the_default_region_is_created(self):
        caption_set = DFXPReader().read(
            SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)

        dfxp = SinglePositioningDFXPWriter().write(caption_set)
        layout = BeautifulSoup(dfxp, features='html.parser').findChild('layout')  # noqa

        self.assertEqual(len(layout.findChildren('region')), 1)
Ejemplo n.º 53
0
Archivo: rest.py Proyecto: mkarnick/FF
    def GET(self, inLeague, inYear, inTeam=0):
        web.header('Content-Type', 'application/json')
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')
        #http://games.espn.go.com/ffl/clubhouse?leagueId=716644&teamId=5&seasonId=2014
        foundTeamRosters=[]
        if inTeam == 0:
            inds = range(1, 12)
        else:
            inds = [inTeam]

        for teamId in inds:
            url = 'http://games.espn.go.com/ffl/clubhouse?leagueId=%s&teamId=%s&seasonId=%s' %(inLeague, teamId, inYear)
            # return url
            soup = BeautifulSoup(urllib2.urlopen(url).read())
            # found = soup.findChildren('table')[0].findChildren('td', class_='playertablePlayerName')
            found = soup.findChildren('table')[0].findChildren('tr', class_='pncPlayerRow')
            foundPlayers=[]
            foundPositions=[]
            teamName = soup.findChildren('table')[0].findChildren(class_='team-name')[0].text
            realName = soup.findChildren('table')[0].findChildren(class_='per-info')[0].text
            for f in found:
                try:
                    # Raw HTML of the player-name cell (strip non-breaking space bytes)
                    fullString = str(f.findChildren('td')[1]).replace('\xa0',' ').replace('\xc2',' ')

                    thisPlayerName = f.a.string
                    thisPlayerSlot = f.findChildren('td')[0].string

                    # Take the two characters just before the closing </td> as the position code
                    strLen = len(fullString)
                    subString = fullString[strLen-7:strLen]
                    pos = subString[0:2]

                    objPlayer = player(name=thisPlayerName, position=pos, slot=thisPlayerSlot)
                    foundPlayers.append(objPlayer.__dict__)

                except:
                    pass
            foundTeamRosters.append(team(name=teamName, roster=foundPlayers,  realName=realName).__dict__) 

        return json.dumps(foundTeamRosters) 
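player and team are small project classes used here only through __dict__; a hypothetical minimal definition consistent with how this handler calls them (the real classes may carry more fields):

# Hypothetical minimal containers; field names mirror the keyword calls above.
class player(object):
    def __init__(self, name, position, slot):
        self.name = name
        self.position = position
        self.slot = slot

class team(object):
    def __init__(self, name, roster, realName):
        self.name = name
        self.roster = roster
        self.realName = realName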
Ejemplo n.º 54
0
def calScrape(br, row, i, calmarks):

	details = []
	cells = row.findChildren('td')

	br.select_form(nr=i)
	i = i+1

	r = br.submit()
	dsoup = BeautifulSoup(r.get_data())
	dtables = dsoup.findChildren('table')

	#if table is present
	try:
		dmyTable = dtables[2]
		
	#if table is absent
	except:

		br.open("https://academics.vit.ac.in/student/cal_da.asp?sem=WS")

		course = cells[2].getText().replace("\r\n\t\t", "")
		ctype = cells[4].getText().replace("\r\n\t\t", "")
		faculty = cells[5].getText().replace("\r\n\t\t", "")

		if course not in calmarks.keys():
			calmarks[course] = {"course_type" : ctype, "faculty" : faculty, "details" : details}
		else:
			# Embedded lab/project components share the course code, so suffix them
			if ctype == "Embedded Lab":
				calmarks[course + "L"] = {"course_type" : ctype, "faculty" : faculty, "details" : details}
			elif ctype == "Embedded Project":
				calmarks[course + "P"] = {"course_type" : ctype, "faculty" : faculty, "details" : details}

	else:

		drows = dmyTable.findChildren(['th','tr'])
		drows = drows[2:-1]

		for drow in drows:

			dcells = drow.findAll('td')
			details.append({
				"assignment_title" : dcells[1].getText(),
				"due_date" : dcells[2].getText(),
				"max_marks" : dcells[3].getText(),
				"assignment_status" : dcells[5].getText() if dcells[5].getText() else "NA",
				"marks_status" : dcells[7].getText() if dcells[7].getText() else "NA",
				"marks_score" : dcells[8].getText() if dcells[8].getText() else "NA"
			})

		br.open("https://academics.vit.ac.in/student/cal_da.asp?sem=WS")

		course = cells[2].getText().replace("\r\n\t\t", "")
		ctype = cells[4].getText().replace("\r\n\t\t", "")
		faculty = cells[5].getText().replace("\r\n\t\t", "")

		if course not in calmarks.keys():
			calmarks[course] = {"course_type" : ctype, "faculty" : faculty, "details" : details}
		else:
			if ctype == "Embedded Lab":
				calmarks[course + "L"] = {"course_type" : ctype, "faculty" : faculty, "details" : details}
			elif ctype == "Embedded Project":
				calmarks[course + "P"] = {"course_type" : ctype, "faculty" : faculty, "details" : details}

	return calmarks
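A hedged sketch of how calScrape might be driven from the CAL/DA listing page. The table index, the row slicing and the one-form-per-course layout are assumptions about the VIT page, not something this snippet confirms:

def scrape_all_cal_marks(br):
	# Assumption: the CAL/DA page lists one course per row, with one form per
	# row whose index lines up with the row order.
	r = br.open("https://academics.vit.ac.in/student/cal_da.asp?sem=WS")
	soup = BeautifulSoup(r.get_data())
	rows = soup.findChildren('table')[2].findChildren('tr')[2:]

	calmarks = {}
	i = 0
	for row in rows:
		calmarks = calScrape(br, row, i, calmarks)
		i += 1
	return calmarks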
Ejemplo n.º 55
0
    def parse(self, response):
        sel = Selector(response)
        profile = {'url': response.url, 'skills': [], 'experience': []}

        # Parse current page URL (public profile URL)

        # Read Skills section
        skills_list = sel.xpath('//a[@class="endorse-item-name-text"]').extract()

        for skill in skills_list:
            skill = self.remove_tag('a', skill)
            profile['skills'].append(skill)

        # List of experience items
        exp_items = []

        # Read Companies and Titles
        exp_entries = sel.xpath('//div[contains(@id, "experience-") and contains(@id, "-view")]').extract()
        for exp_entry in exp_entries:
            b_soup = BeautifulSoup(exp_entry)

            #Get company name
            exp_company_matches = b_soup.findChildren('a', href=re.compile(r'prof-exp-company-name'))
            exp_company = exp_company_matches[-1].get_text() \
                if exp_company_matches else None

            # Get title within company
            exp_title = b_soup.findChild('a', {'name': 'title'}).get_text()

            # Get work description
            exp_desc_match = b_soup.findChild('p', {'class': 'description'})
            exp_desc = exp_desc_match.get_text() if exp_desc_match is not None else None

            # Get work date-locale
            exp_date_loc = b_soup.findChild('span', {'class': 'experience-date-locale'})

            exp_duration_items = exp_date_loc.findChildren('time')
            exp_is_current = 'Present' in exp_duration_items[1].get_text()
            exp_duration = re.sub(r'[^a-zA-Z0-9 ]', '', exp_duration_items[2].get_text()).strip()

            exp_location_item = exp_date_loc.findChild('span', {'class': 'locality'})
            exp_location = None
            if exp_location_item is not None:
                exp_location = re.sub(r'^[^"]*"', '', exp_location_item.get_text())
                exp_location = exp_location.replace("\"", "").strip()

            exp_items.append(ExperienceItem(exp_is_current, exp_title, exp_company,
                                            exp_location, exp_duration, exp_desc))

        profile['experience'] = exp_items

        # Sleep to appease LinkedIn rate limiting
        time.sleep(5)

        self.profile_map[response.url] = profile
        return LinkedInItem(profile)
Ejemplo n.º 56
0
def main():

    numOfArgs=len(sys.argv)

    if numOfArgs != 4:
        print 'Usage: A1.py <university> <sec> <URI>'
        print 'e.g.: A1.py "old dominion" 60 http://sports.yahoo.com'
        sys.exit(1)

    print 'Number of arguments:', len(sys.argv), 'arguments.'
    univ = str(sys.argv[1])
    sec = int(sys.argv[2])
    uri = str(sys.argv[3])
    print 'Team Name: ' ,univ
    print 'Time to Sleep: ' ,sec
    print 'URI: ' ,uri

    print "-" * 72

    while True:

        # Re-fetch the page on every pass so the scores actually refresh
        response = requests.get(uri)
        soup = BeautifulSoup(response.content)  # parse the HTML content of the page
        tables = soup.findChildren('table')     # all <table> children of the page
        #print tables[1].prettify()
        score_table = tables[1]  # the second table holds the scores we are interested in

        for row in score_table('tr', {'class' : 'game link' }):

            if univ.lower() in str(row).lower() :
                td_team_home = row('td', {'class' : 'home' }) 
                span_home    = td_team_home[0]('em')[0].contents[0]  # the result set is a list; take the first match's contents

                td_team_away = row('td', {'class' : 'away' })
                span_away    = td_team_away[0]('em')[0].contents[0]    

                td_score     = row('td', {'class' : 'score' })
                span_home_score    = td_score[0]('span')[1].contents[0]
                span_away_score    = td_score[0]('span')[0].contents[0]

                print "*" * 8
                print span_home
                print span_home_score
                print

                print span_away
                print span_away_score
                print

                print 'Press ctrl+c to exit getting the scores'

                time.sleep(sec)  # wait <sec> seconds before printing again
                print "*" * 8
                print "-" * 72
Ejemplo n.º 57
0
def add_page_to_index(url, html):
    body_soup = BeautifulSoup(html, "html.parser").find('body')
    for child_tag in body_soup.findChildren():
        if child_tag.name == 'script':
            continue
        child_text = child_tag.text
        for line in child_text.split('\n'):
            line = line.strip()
            for word in _split_to_word(line):
                add_to_index(word, url)
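_split_to_word and add_to_index are not shown here; a hedged sketch of a minimal tokenizer and in-memory inverted index they could correspond to (names and structure are assumptions):

import re

# Hypothetical helpers; the real crawler's tokenizer and index may differ.
index = {}

def _split_to_word(line):
    return re.findall(r"[A-Za-z0-9']+", line.lower())

def add_to_index(word, url):
    index.setdefault(word, set()).add(url)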
Ejemplo n.º 58
0
def GetCourses(link):
	html = requests.get(link).text
	soup = BeautifulSoup(html, 'html.parser')
	table = soup.findChildren('tr',{'class': 'title'})

	results = []

	for row in table:
		results.append(row.find('a').find(text=True))

	return results
Ejemplo n.º 59
0
def parse_html(link, tag, tag_name):
	try:
		html = ""
		try:
			html = urlopen(link).read()
		except Exception as e:
			print "Error1 = " + str(e)
		soup = BeautifulSoup(html)
		data = soup.findChildren(attrs = {tag: re.compile(tag_name)})
		return data[0]
	except Exception as e:
		print "Error2 = " + str(e)