Example #1
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
        extractor.
        :return:
        """
        raw_stream = warc_record.raw_stream.read()
        encoding = None
        try:
            encoding = warc_record.http_headers.get_header(
                'Content-Type').split(';')[1].split('=')[1]
        except (AttributeError, IndexError):
            pass
        if not encoding:
            encoding = EncodingDetector.find_declared_encoding(raw_stream,
                                                               is_html=True)
        if not encoding:
            # assume utf-8
            encoding = 'utf-8'

        html = raw_stream.decode(encoding)
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html,
                                       url=url,
                                       download_date=download_date)
        return article
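A minimal usage sketch for the extractor above (not part of the original snippet), assuming the warcio library and a hypothetical local WARC file; from_warc is called here as a plain function:

from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as stream:  # hypothetical path
    for record in ArchiveIterator(stream):
        if record.rec_type == 'response':
            article = from_warc(record)
            print(article.title)  # NewsArticle objects expose fields such as title and maintext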
Example #2
def getOneEntry(searchTerm):
    searchTerm = searchTerm.replace('\n', '')
    response = requests.get(
        urlSearchTemplate.format(searchTerm.replace(' ', '%20')))

    if response.ok:
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)

        result = processHtml(soup, searchTerm)

        if ("/tpl" in result[0]):
            result = getOneEntry2(result[1], result[0])

        resultSplited = result.split(',')
        if len(resultSplited) == 3:
            # result is already a str in Python 3, so just strip whitespace
            resultSplited = [i.strip() for i in resultSplited]
            nome = resultSplited[0]
            status = resultSplited[1]
            nome_aceito = resultSplited[2]
            return nome, status, nome_aceito
        else:
            return '', '', ''

    else:
        return 'Bad Response!'
Example #3
def scrape_page(url):
    '''
    NOTE: this throws away any links that can't be addons (ie: assumes we're not going any deeper)
    '''
    resp = None
    links = set()
    if url.endswith('.jpg') or url.endswith('.png') or url.endswith(
            '.gif') or url.endswith('.rar'):
        return set()
    head = time_wrapper(requests.head, (url, ), t=3)
    if head:
        try:
            cl = int(head.headers['Content-Length'])
        except (KeyError, ValueError):
            cl = -1
        if cl < 1000000:
            resp = time_wrapper(requests.get, (url, ), t=3)
    if not resp:
        return set()
    netloc = urlparse(url).netloc.split(':')[0]
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    for link in soup.find_all('a', href=True):
        if ".zip" in link['href'] or 'github' in link['href']:
            href = link['href']
            if not href.startswith('http'):
                href = 'http://' + netloc + '/' + href
            if can_be_repo(href):
                links.add(href)
    return links
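scrape_page relies on an external time_wrapper helper that is not shown above. A minimal sketch of what such a helper might look like, assuming it simply forwards the positional arguments and applies t as the request timeout:

def time_wrapper(func, args, t=3):
    # hypothetical helper: call func(*args) with a timeout, returning None on any request failure
    try:
        return func(*args, timeout=t)
    except requests.RequestException:
        return None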
Example #4
def getHTML(url, verb=False):
    '''
    This function takes a URL as input and returns the corresponding
    bs4 object.
    '''

    from bs4.dammit import EncodingDetector

    try:
        resp = session.get(url, headers=headers, timeout=(10, 30))

    except requests.RequestException as err:
        print('problem fetching {}: {}'.format(url, err))
        return None

    else:
        if resp.status_code == 200:
            # dealing with encoding
            http_encoding = resp.encoding if 'charset' in resp.headers.get(
                'content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(
                resp.content, is_html=True)
            encoding = html_encoding or http_encoding

            # generating BeautifulSoup object
            bsObj = BeautifulSoup(resp.content,
                                  'html5lib',
                                  from_encoding=encoding)

            if verb:
                print("The title of html is %s" % bsObj.title.getText())
            return bsObj
        else:
            return None
Example #5
def scrape_politifact_article(story_url):
    resp = requests.get(story_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup.find("div", "article__text").get_text()
Example #6
def getLinks():
    parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
    for i in range(1,100):
        if os.path.exists('pdfs/' + str(i)):
            print(str(i),'already exists')
            continue
        resp = requests.get("https://quizbowlpackets.com/"+str(i))
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)

        links = []

        allLinks = soup.find_all('a', href=True)
        combined = ['pdf' in link['href'] for link in allLinks]
        if not any(combined):
            print(str(i), 'doesn\'t exist')
            continue

        for link in allLinks:
            link = link['href']
            if 'Packet' in link:
                links.append(link)
        print(links)
        with open('pdfs/' + str(i),'wb') as file:
            pickle.dump(links, file)
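Note that getLinks pickles its results into a pdfs/ directory which must already exist; a small guard before the loop (an assumption about the intended layout, not part of the original) avoids a FileNotFoundError:

os.makedirs('pdfs', exist_ok=True)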
Example #7
    def get_soup(self, _page=0):
        """ scrape web-site page """

        # get request
        self.__response = self.get_request()
        if self.__verbose:
            _log.debug(f'self.__response={self.__response}')

        # get encoding
        _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
            'content-type', '').lower() else None
        _html_encoding = EncodingDetector.find_declared_encoding(
            self.__response.content, is_html=True)

        # get soup
        self.__soup = None
        try:
            if self.__verbose:
                _log.debug(f'Getting soup from self.__response.text')
            self.__soup = BeautifulSoup(self.__response.text,
                                        features='html5lib',
                                        from_encoding=(_html_encoding
                                                       or _http_encoding))
            if self.__verbose:
                _log.debug(f'Got soup from self.__response.text OK')
        except Exception as e:
            self.__soup = None
            if self.__verbose:
                _log.error(
                    f'Failed to get soup from self.__response.text, error={e}')
Example #8
    def from_warc(warc_record, decode_errors="replace"):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
        extractor.
        :return:
        """
        raw_stream = warc_record.raw_stream.read()
        encoding = None
        try:
            encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
        except (AttributeError, IndexError):
            pass
        if not encoding:
            encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
        if not encoding:
            # assume utf-8
            encoding = 'utf-8'

        try:
            html = raw_stream.decode(encoding, errors=decode_errors)
        except LookupError:
            # non-existent encoding: fall back to utf-8
            html = raw_stream.decode('utf-8', errors=decode_errors)
        if not html:
            raise EmptyResponseError()
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article
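EmptyResponseError is raised above but not defined in the snippet; presumably it is a simple custom exception along these lines:

class EmptyResponseError(Exception):
    """Raised when a WARC record decodes to an empty HTML document."""
    pass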
Example #9
def otherTeams(sumOfQ):
    remainingTeams = 16 - sumOfQ
    finalTeams = []
    if remainingTeams <= 0:
        return []
    else:
        randomTeams = sample(range(10), remainingTeams)
        randomTeamsIndex = sample(range(remainingTeams + 2), remainingTeams)
        randomTeamSelection = choices(range(len(league_qlf_list)), k=remainingTeams)
        for i in range(0, remainingTeams):
            temp_x = randomTeamSelection[i]
            try:
                tempClubsLst = []
                url = 'https://www.worldfootball.net' + league_qlf_list[temp_x]
                source = requests.get(url, headers=header)
                http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
                html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
                encoding = html_encoding or http_encoding
                soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
                find_boxS = soup.find('div', class_="scrollable_tables")
                the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
                for theTeamAtag in the_team_table.find_all('a', href=True):
                    if theTeamAtag.text:
                        tempClubsLst.append(theTeamAtag.text)
                y = randomTeamsIndex[i]
                teamNames = tempClubsLst[y]
                finalTeams.append(teamNames)
            except Exception as e:
                print(e)
        return finalTeams
Example #10
    def getSteam(self, q, size):
        querys = q.replace(" ", "+")
        url = ('https://store.steampowered.com/search/?term=' + str(querys) +
               '&category1=998')
        resp = requests.get(url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content,
                             from_encoding=encoding,
                             features="lxml")
        print(url)

        SteamLinkList = []

        #find links to apps
        for link in soup.find_all('a', href=re.compile('app')):
            #remove duplicates
            if (link['href'] not in SteamLinkList):
                SteamLinkList.append(link['href'])

        #remove first two irrelevant links
        return SteamLinkList[2:size + 2]
Example #11
def desi_crawler(u_r_l):
    web_list = []
    url = u_r_l
    web_list.append(url)
    domain = url

    if "www." not in domain:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[1]
    else:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[2]

    for url in web_list:
        response = requests.get(url)
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, from_encoding=encoding)

        for link in soup.find_all('a', href=True):
            if domain in link['href']:
                if link['href'] not in web_list:
                    web_list.append(link['href'])

    # return every same-domain link discovered during the crawl
    return web_list
Example #12
def top4leagues(leagueList, index):
    rangeOfWork = team_qulfied[index]
    defaultLst = []
    temIndex = sample(range(12),rangeOfWork)
    for i in range(rangeOfWork):
        clubsIndex = randint(0,1)
        try:
            tempClubsLst = []
            url = 'https://www.worldfootball.net'+leagueList[clubsIndex]
            source = requests.get(url, headers=header)
            http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
            find_boxS= soup.find('div',class_="scrollable_tables")
            the_team_table = find_boxS.find('table', {'class':'standard_tabelle'})
            for theTeamAtag in the_team_table.find_all('a',href=True):
                if theTeamAtag.text:
                    tempClubsLst.append(theTeamAtag.text)
            y = temIndex[i]
            teamNames = tempClubsLst[y]
            defaultLst.append(teamNames)
            tempClubsLst.pop()
        except Exception as e:
            print(e)
    return defaultLst
Example #13
    def getIMDB(self, queryi):

        url = ('https://www.imdb.com/search/keyword/?keywords=' + str(queryi) +
               '&ref_=fn_kw_kw_1&mode=detail&page=1&sort=moviemeter,asc')
        resp = requests.get(url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content,
                             from_encoding=encoding,
                             features="lxml")
        print(url)

        imdbLinkList = []

        #find links to titles
        for link in soup.find_all('a', href=re.compile('title')):
            #remove irrelevant links
            if "vote" not in link['href'] and "search" not in link[
                    'href'] and "plotsummary" not in link['href']:
                #remove duplicates
                if ('https://www.imdb.com' + link['href'] not in imdbLinkList):
                    imdbLinkList.append('https://www.imdb.com' + link['href'])

        return imdbLinkList
Example #14
def prepare_complete_links(url):

    http_regex = re.compile(r'http')
    page = requests.get(url)
    http_encoding = page.encoding if 'charset' in page.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(page.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding=encoding)
    complete_links = []
    for alink in soup.find_all('a', href=True):
        if http_regex.search(alink['href']) is not None:
            complete_links.append(alink['href'])
            print(
                http_regex.search(alink['href']).group() + "---" +
                alink['href'])
        elif 'javascript' not in alink['href'] and len(
                alink['href'].strip()) > 0:
            if alink['href'][:1] == '/':
                temp_link = TWM_DOMAIN + alink['href']
                complete_links.append(temp_link)
                print("need http" + "---" + alink['href'])
            else:
                temp_link = TWM_DOMAIN + "/" + alink['href']
                complete_links.append(temp_link)

    return list(set(complete_links))
Example #15
def compile_links(web_address):
    '''
    compile_links accesses a webpage at a given address,
    finds all of the links on that page, and appends the relevant links
    to a list called links_list.

    compile_links works together with find_diffraction_files to
    keep only the relevant links.

    Input is a web address; the function returns the list of collected links.
    '''

    html_page = requests.get(web_address)
    http_encoding = html_page.encoding if 'charset' in\
        html_page.headers.get('content-type', '').lower() else None
    html_encoding =\
        EncodingDetector.find_declared_encoding(html_page.content,
                                                is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding,
                         features="html.parser")
    links_list = []

    permutation_attempt = soup(text=re.compile("Now trying variations on your request:"))
    if len(permutation_attempt) != 0:
        return links_list

    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu'+link['href'])

    return links_list
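compile_links filters anchors through a find_diffraction_files callable that is not shown. BeautifulSoup passes each tag's href value to such a callable and keeps the tags for which it returns True; a hypothetical sketch of the filter might look like:

def find_diffraction_files(href):
    # hypothetical filter: keep only hrefs that look like diffraction data files
    return href is not None and href.lower().endswith('.txt')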
Example #16
def get_all_uic_links_from_url(base_url, h=None):
    resp = requests.get(base_url, headers=headers)
    base_url = resp.url
    if is_url_end_point(base_url):
        return [], ""
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding

    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    uic_link_list = []
    for link in soup.find_all('a', href=True):
        if is_url_end_point(link['href']):
            continue
        target_url = ''
        o = urlparse(link['href'])
        if "uic.edu" in o.netloc:
            target_url = link['href'].rstrip('/')
        elif not is_absolute(link['href']):
            target_url = (urllib.parse.urljoin(base_url,
                                               link['href'])).rstrip('/')
        target_url = target_url.replace("http:", "https:")

        if target_url != '':
            uic_link_list.append(target_url)
    # h is assumed to be a text extractor (e.g. an html2text handler) whose handle() converts the page to plain text
    return list(set(uic_link_list)), h.handle(resp.text)
Example #17
def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(
        url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    return BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)
Example #18
 def doc_encoding(self) -> str:
     http_encoding = self.doc.encoding if "charset" in self.doc.headers.get(
         "Content-Type", "").lower() else None
     html_encoding = EncodingDetector.find_declared_encoding(
         self.doc.content, is_html=True)
     # note: this yields the literal string 'None' if neither source declares an encoding
     encoding: str = str(html_encoding or http_encoding)
     self.sdoc.encoding = encoding
     return encoding
Example #19
def get_text(html):
    # Detect encoding and extract plain text from page
    encoding = EncodingDetector.find_declared_encoding(html, is_html=True)
    soup = BeautifulSoup(html, "lxml", from_encoding=encoding)
    for script in soup(["script", "style"]):
        script.extract()

    return soup.get_text(" ", strip=True)
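A short usage sketch (assumption: html is the raw bytes of an HTTP response body):

resp = requests.get('https://example.com')  # hypothetical URL
print(get_text(resp.content))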
Example #20
 def get_html_title(self, page, record):
     try:
         encoding = EncodingDetector.find_declared_encoding(page,
                                                            is_html=True)
         soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
         title = soup.title.string.strip()
         return title
     except Exception:
         return ""
Example #21
 def grab_projects(self, resp):
     http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
     html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
     encoding = html_encoding or http_encoding
     soup = BeautifulSoup(resp.content, from_encoding=encoding)
     links = [self.BASE_URL + link['href'] for link in soup.find_all('a', href=True) if
              link['href'].startswith("/projects/")]
     self.add_to_queue(urls=links, website_name=self.NAME)
     return len(links)
Example #22
def get_html(url):
    headers = {"User-Agent": USERAGENT}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    if encoding:
        # let requests decode resp.text with the detected charset instead of its default guess
        resp.encoding = encoding
    return resp.text
Example #23
def get_soup_for_url(base_url):
    resp = requests.get(base_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    return soup
Example #24
def get_source_html(url):
    headers = {"User-Agent": 'Chrome'}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    webpage = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return webpage
Example #25
 def get_html_text_body(self, page, record):
     try:
         encoding = EncodingDetector.find_declared_encoding(page,
                                                            is_html=True)
         soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
         for script in soup(["script", "style"]):
             script.extract()
         return soup.get_text(" ", strip=True)
     except Exception:
         return ""
Example #26
def get_soup_html(url, headers=GET_HEADER):
    resp = SESSION.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)

    return soup
Example #27
def getSoup(matchUrl):
    res = requests.get(matchUrl)
    res.raise_for_status()

    http_encoding = res.encoding if 'charset' in res.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(res.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding

    return bs4.BeautifulSoup(res.content, 'lxml', from_encoding=encoding)
Example #28
 def get_text(self, record):
     try:
         content = record.content_stream().read()
         encoding = EncodingDetector.find_declared_encoding(content,
                                                            is_html=True)
         soup = BeautifulSoup(content, "lxml", from_encoding=encoding)
         # strip all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
         return soup.get_text(" ", strip=True)
     except Exception:
         return ""
Example #29
    def get_the_soup(self):
        if self.url is None:
            # Early exit
            return

        resp = requests.get(self.url, proxies=urllib.request.getproxies())
        http_encoding = (resp.encoding if "charset" in resp.headers.get(
            "content-type", "").lower() else None)
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, "lxml", from_encoding=encoding)
        adjust_anchors(soup)
        return soup
Example #30
def getData(searchTerm,
            offset=0,
            inputFile=os.path.join('data', 'ListaMacrofitasResult.csv')):

    response = requests.get(urlSearchTemplate.format(searchTerm, offset))

    registries = []
    not_found_registries = []

    if response.ok:

        # Fetch the encoding used by the source
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)

        divs = soup.findAll("div", {"class": "record"})

        try:
            page_hint = soup.find("div", {"id": "div_hint_summary"})
        except:
            writeNotFoundOutput(searchTerm)
            return

        # If no results are found, write searchTerm to file
        if page_hint is None or "Nenhum registro encontrado" in page_hint.find(
                "b").text:
            writeNotFoundOutput(searchTerm)
        else:
            hints = page_hint.findAll('ll')

            offset = int(hints[1].text)
            max_registries = int(hints[2].text)

            for div in divs[1:]:
                scientificName, municipality, state, country, latitude, longitude, date = parseDiv(
                    div)
                registries.append('{},{},{},{},{},{},{}'.format(
                    searchTerm, municipality, state, country, latitude,
                    longitude, date))

            writeOutput(registries)

            if (offset < max_registries):
                getData(searchTerm, offset)
    else:
        response.raise_for_status()
Example #31
def get_html_encoding(html):
    return EncodingDetector.find_declared_encoding(html, is_html=True, search_entire_document=False)
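A short usage sketch (assumption: the page bytes come from a fetched response):

resp = requests.get('https://example.com')  # hypothetical URL
declared = get_html_encoding(resp.content)
text = resp.content.decode(declared or 'utf-8', errors='replace')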