def getACMRecords(searchInput, bibs, years=''):
    searchString = getSearchString(searchInput) + years
    print("from get acm records", searchString)
    url = "https://dl.acm.org/action/doSearch?fillQuickSearch=false&expand=dl" + searchString + '&pageSize=50'
    print(url)
    try:
        x = requests.get(url)
        parsed_html = sp(x.text, "html.parser")

        totalNumber = int(
            parsed_html.find("span", {
                "class": "hitsLength"
            }).string.strip().replace(',', ''))
        totalPages = ceil(totalNumber / 50)
        print("Total Results: ", totalNumber)
        print("Total Pages: ", totalPages)

        # Extract first page
        bibs += scrape(parsed_html)
        # return
        for i in range(1, totalPages):
            print("Page ", i)
            next_page = url + "&startPage=" + str(i)  # pageSize=50 is already part of url
            print(next_page)
            x = requests.get(next_page)
            parsed_html = sp(x.text, "html.parser")
            bibs += scrape(parsed_html)
        print("total bibs from acm", len(bibs))
    except Exception as e:
        print("No results returned or unexpected page structure:", e)
    return bibs
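
The example above computes the page count as ceil(total_hits / 50) and then appends a startPage offset to the same search URL. A minimal standalone sketch of that pagination arithmetic (the function and variable names here are illustrative, not part of the original code):

from math import ceil

def acm_page_urls(base_url, total_hits, page_size=50):
    # Yield one search URL per result page, mirroring the loop above.
    for page in range(ceil(total_hits / page_size)):
        yield base_url + "&pageSize=" + str(page_size) + "&startPage=" + str(page)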
Example #2
def get_ads(category, pages_count):
    ads_links = []
    for page in range(pages_count):
        # The first page of an OLX category has no ?page= parameter.
        refurl = f'https://www.olx.com.eg/{category}/?page={page + 1}'
        if page == 0:
            refurl = f'https://www.olx.com.eg/{category}/'

        ajx_url = 'https://www.olx.com.eg/ajax/search/list/'
        print(f'=========={page+1}==========')
        print(refurl)
        headers = {
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'referer': refurl,
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }
        response = get(ajx_url, headers=headers)
        soup = sp(response.content, 'lxml')
        ads = soup.find_all('div', class_='ads__item__info')
        for ad in ads:
            ads_links.append(ad.a['href'])

    return ads_links
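
Assuming get_ads is defined as above in the same module (with `from requests import get` and `from bs4 import BeautifulSoup as sp` in scope), a hypothetical call could look like the following; the category slug is made up for illustration:

if __name__ == "__main__":
    links = get_ads("mobile-phones", pages_count=2)  # "mobile-phones" is an illustrative slug
    for link in links:
        print(link)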
Example #3
 def tc(self):
     pg = r(
         techPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('ul', {'class': 'article-list'}).findAll('li')
         for i in imgsl:
             imgs.append(i.find('img').get('src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             heads.append(i.find('img').get('alt'))
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
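
The news-page methods in these examples all follow the same pattern: collect parallel lists of images, headlines, links and logos, then zip them into tuples. A small self-contained sketch of that pattern (the HTML snippet is invented for illustration):

from bs4 import BeautifulSoup

sample = ('<li><a href="/a1"><img src="i1.jpg" alt="Headline 1"></a></li>'
          '<li><a href="/a2"><img src="i2.jpg" alt="Headline 2"></a></li>')
soup = BeautifulSoup(sample, 'html.parser')
heads, links, imgs = [], [], []
for li in soup.find_all('li'):
    img = li.find('img')
    imgs.append(img.get('src'))
    heads.append(img.get('alt'))
    links.append(li.find('a').get('href'))
news = list(zip(imgs, heads, links))
print(news)  # [('i1.jpg', 'Headline 1', '/a1'), ('i2.jpg', 'Headline 2', '/a2')]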
Example #5
def dork():  # Retrieve the URLs
    menu(1)
    print("\033[94m[INFO] Search started\033[0m")
    a = 0  # current results page
    b = 0  # number of links collected so far
    c = sys.argv[2]  # search query (dork)
    page1 = open("page.html", "w")
    try:
        while a < 100:
            try:
                r = requests.get("http://www.ask.com/web?q=" + str(c) +
                                 "&page=" + str(a))
            except IOError as e:
                print(str(e))
                break
            print(str(b))
            soup = sp(r.text, "html.parser")
            for i in soup.find_all("p", {"class": "web-result-url"}):
                lien = i.text
                try:
                    page1.write(lien + "\n")
                except Exception:
                    pass
                print("[\033[92m" + str(b) + "\033[0m]--> " + str(lien))
                b += 1
            a += 1
    except IOError as e:
        print(bcolors.FAIL + "Stopped\033[0m\n" + str(e))
    finally:
        page1.close()
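
The query string above is assembled by hand; requests can also handle the URL encoding through its params argument. A small equivalent sketch (same endpoint and parameters as the example, just a different way to build the request):

import requests

def ask_page(query, page):
    # requests encodes the query string, including special characters in the dork.
    return requests.get("http://www.ask.com/web", params={"q": query, "page": page})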
Example #6
def look_up(entry):
    url = "https://baike.baidu.com/item/" + quote(entry)

    req = request.Request(url, headers=header)

    html = request.urlopen(req).read()
    soup = sp(html, "html.parser")
    # le2 = soup.select('.para-title.level-2')  # superseded by the findAll below
    le2 = soup.findAll('h2', {'class': "title-text"})
    #print(le2)
    x = 1
    msg = []
    msg1 = []
    for l in le2:
        msg.append(l.text)  # section heading text; no tag-repr cleanup needed with .text
        msg1.append(url + "#" + str(x))
        x = x + 1

    le3 = soup.select('.para-title.level-3')  # level-3 sub-headings (currently unused)

    content = soup.findAll('div', {'class': 'para'})
    for i in content:
        i = i.get_text()
        i = i.replace('\n', '')
        i = i.replace('\r', '')
        i = re.sub(bracket, '', i)
    #    print(i)

    # for k in range(len(msg) - 1):
    #     return msg[k]
    # for f in range(len(msg1) - 1):
    #     return msg1[f]
    return msg, msg1
Example #7
 def sc1(self):
     pg = r(
         homePage.url2,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img alt="Times of India" class="img-fluid" src="https://static.mediawire.in/brands/profilepic/1117/TOI%20Logo%20in%20Red%20Bakcground.jpg">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'listing4 clearfix'
         }).find('ul').findAll('li')
         for i in imgs1:
             heads.append(i.find('span').find('a').text)
             links.append(i.find('span').find('a').get('href'))
             imgs.append(i.find('a').find('img').get('data-src'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #8
 def sp(self):
     # pg=r(url1,{'User-Agent':'Magic Browser'})
     pg = r(
         sportPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg" title="The Indian Express" alt="The Indian Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('div', {
             'class': 'nation'
         }).findAll('div', {'class': 'snaps'})
         headsl = soup.find('div', {
             'class': 'nation'
         }).findAll('h2', {'class': 'title'})
         for i in imgsl:
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             imgs.append(i.find('img').get('data-lazy-src'))
         for i in headsl:
             heads.append(i.find('a').text)
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #9
 def tc2(self):
     pg = r(
         techPage.url3,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://cdn.gadgets360.com/gadgets360_logo.png" alt="Technology News" title="NDTV Gadgets 360">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgsl = soup.find('div', {
             'class': 'story_list row margin_b30'
         }).findAll('div', {'class': 'thumb'})
         for i in imgsl:
             if i.find('img').get(
                     'src'
             ) == "https://gadgets.ndtv.com/static/icons/img_120n.png":
                 imgs.append(i.find('img').get('data-original'))
             else:
                 imgs.append(i.find('img').get('src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
             heads.append(i.find('img').get('alt'))
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #10
def loadImage():
    for j in range(2, 5):  # listing pages 2 through 4 of the tag
        urlLink = 'https://www.27270.com/tag/333' + '_' + str(j) + '.html'
        print("Crawling listing page %d" % j)
        imLinks=getLink(urlLink)
        for imLink in imLinks:
            for i in range(1, 41):  # up to 40 numbered pages per gallery
                imLink1 = imLink[0:-5]
                imLink2 = imLink1 + '_' + str(i) + '.html'
                print(imLink2)
                content = loadPage(imLink2)
                html = sp(content, 'html.parser')
                try:
                    link = html.find_all('img', attrs={'alt':True,'height':False})[0]
                    time.sleep(3)
                    # print(link)
                    if link is None:
                        print("爬取完成")
                        pass
                    else:
                        name1 = link.get('alt')
                        name1 = re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", name1)
                        link1 = link.get('src')
                        name1 = name1 + str(i)
                        print('Crawling ' + name1)
                        saveImage(link1,name1)
                except:
                    print("爬取完成")
                    break
Example #11
def formdata(links):
    data=[]
    for i in links:
        try:
            html=requests.get(i).text
            soup=sp(html,'lxml')
            url=i
            title=soup.find('title').text
            tag=''
            category=''
            try:
                tag=getTagAndCateByLink(i,'Tags')
            except Exception:
                pass
            try:
                category=getTagAndCateByLink(i,'Categories')
            except Exception:
                pass
            try:
                catchhtml=soup.find(id='cnblogs_post_body')
                catchhtml=str(catchhtml)
                data.append(article(url, title, catchhtml,tag,category))
            except Exception:
                print(i+' has no main')
        except Exception as e:
            print(e)
    return data
Example #12
 def ec(self):
     # pg=r(url1,{'User-Agent':'Magic Browser'})
     pg = r(
         ecoPage.url1,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img class="img-fluid" src="https://www.financialexpress.com/wp-content/themes/vip/financialexpress/assets/images/fe-logo-with-read-to-lead.svg" alt="Financial Express">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgl1 = soup.find('div', {'class': 'leftcol'}).findAll('figure')
         titles1 = soup.find('div', {'class': 'leftcol'}).findAll('h2')
         titles2 = soup.find('div', {'class': 'leftcol'}).findAll('h3')
         for i in imgl1:
             imgs.append(i.find('img').get('data-src'))
             links.append(i.find('a').get('href'))
             logos.append(logourl)
         for i in titles1:
             heads.append(i.find('a').text)
         for i in titles2:
             heads.append(i.find('a').text)
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #13
def update():
    local_time = time.ctime(time.time())
    url = "https://rate.bot.com.tw/gold?Lang=zh-TW"
    with request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    root = sp(data, "html.parser")
    goal_in = root.find_all("td")[5].text.replace("回售", "").strip()
    goal_out = root.find_all("td")[2].text.replace("買進", "").strip()
    s1 = ("\nGold" + "\n銀行買進: " + goal_in + "\n銀行賣出: " + goal_out)
    note = local_time + s1
    lb.config(text=note)
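
update() depends on fixed <td> positions in the bank's gold-rate table (index 2 for the buy price, index 5 for the sell-back price). A small offline sketch of that positional lookup; the HTML row is invented, and the real page layout may differ:

from bs4 import BeautifulSoup

sample = ("<table><tr><td>品名</td><td>單位</td><td>買進 1700</td>"
          "<td>x</td><td>y</td><td>回售 1698</td></tr></table>")
root = BeautifulSoup(sample, "html.parser")
goal_out = root.find_all("td")[2].text.replace("買進", "").strip()
goal_in = root.find_all("td")[5].text.replace("回售", "").strip()
print(goal_in, goal_out)  # 1698 1700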
Example #14
def index(request, url):
    conn = urllib.urlopen(url)
    respose = conn.read()
    cleanSoup = sp(respose, "html.parser")
    try:
        for a in cleanSoup.findAll('a'):
            a['href'] = "/proxy/" + a['href']
    except:
        print "err"
    respose = str(cleanSoup)
    return HttpResponse(respose)
Example #15
def getLink(url):
    imList = []  # holds the image page links
    content = loadPage(url)
    html = sp(content, 'html.parser')
    for link in html.find_all('ul', attrs={'id':'Tag_list'}):
        link = link.find_all('a', attrs={'target':'_blank'})
        for link1 in link:
            link1 = link1.get('href')
            imList.append(link1)
            # print(link1)
    return imList
Example #16
def getPageLink(start,end):
    links=[]
    for i in range(start,end+1):
        payload['PageIndex']=i
        try:
            response = requests.post(next,headers=headers,data=json.dumps(payload))
            s = sp(response.text, 'lxml')
            link = s.findAll(class_='titlelnk')
            for j in link:
                links.append(j.get('href'))
        except Exception as e:
            print(e)
    return links
Example #17
    def search_process(self):  # get the actual result process

        data = self.search_init()
        x = "\nSearching results.....\n"
        for i in x:
            print(i, end=" ")
            time.sleep(.200)
            sys.stdout.flush()

        #print("Searching for results....")
        soup = sp(data.content, "lxml")
        data_scrap = soup.find("a", {"class": "result__a"}).get_text()
        return data_scrap
Example #18
def getLinks(articleUrl, pageNum):
    page = requests.get("http://en.wikipedia.org" + articleUrl, timeout=5)
    html = page.content.decode('utf-8')
    bsObj = sp(html, "html.parser")
    info = bsObj.find("div", {"class": "mw-parser-output"})
    # Strip script/noscript/style blocks before extracting the article text.
    Filter = {'script', 'noscript', 'style'}
    for items in Filter:
        for va in info.find_all(items):
            va.decompose()
    filepath = 'C:/Users/qyjbo/Desktop/Web_English/'
    with open(filepath + str(pageNum) + '.txt', 'w', encoding='gb18030') as f:
        f.write(info.get_text())
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
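
The final findAll keeps only internal /wiki/ links that contain no namespace colon. A quick standalone check of that regular expression (the sample hrefs are made up):

import re

wiki_link = re.compile("^(/wiki/)((?!:).)*$")
for href in ["/wiki/Web_scraping", "/wiki/File:Logo.png", "https://example.org"]:
    print(href, bool(wiki_link.match(href)))
# /wiki/Web_scraping True; /wiki/File:Logo.png False; https://example.org False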
Example #19
def save_html(result_url):
    html_folder_path = os.getcwd() + "/" + "w2" + "_folder"
    os.mkdir(html_folder_path)
    index = 1
    print("Save html Begin...")
    for url in result_url:
        print("saving num " + str(index))
        response = requests.get(url).text
        time.sleep(1)
        html = str(sp(response, "html.parser"))
        html_path = html_folder_path + "/" + str(index) + ".txt"
        html_fw = open(html_path, "w")
        html_fw.write(html)
        html_fw.close()
        index += 1
Example #20
def web_crawl(url, dat):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    try:
        url_cont = requests.get(url, headers=headers, timeout=5)
        pg_soup = sp(url_cont.content, "html.parser")
    except requests.ConnectionError as e:
        print(
            "OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n"
        )
        print(str(e))
        return dat
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
        return dat
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
        return dat
    except KeyboardInterrupt:
        print("Someone closed the program")
        return dat
    try:
        pgno = pg_soup.find("span", {"pageInfo"}).text.split()
    except AttributeError:
        return dat
    a1 = int(pgno[1]) + 1
    a2 = int(pgno[3])

    p = pg_soup.findAll("div", {"class": "row review_table_row"})

    for x in p:
        review = x.find("div", {"class": "user_review"}).text
        emo = len(x.findAll("span", {"class": "glyphicon glyphicon-star"}))
        if emo == 3:
            continue
        dat.append([review, emo])
    print(len(dat))
    ind = url.find('/?')
    url2 = url[:ind] + "/?page=" + str(a1) + "&type=user"
    print(url2)
    if int(a1) < int(a2):
        # Recurse into the next review page and propagate the collected data.
        return web_crawl(url2, dat)
    else:
        return dat
Example #21
def Get_Entity_Description(entry):
    url = "https://baike.baidu.com/item/" + quote(entry)
    req = request.Request(url, headers=header)
    html = request.urlopen(req).read()
    soup = sp(html, "html.parser")
    Description_len = 10
    content = soup.findAll('div', {'class': 'para'})
    if content:
        for idx, i in enumerate(content):
            i = i.get_text()
            i = i.replace('\n', '')
            i = i.replace('\r', '')
            i = i.replace(u'\xa0', u'')
            i = re.sub(bracket, '', i)
            if (len(i) > Description_len):
                break
        return i if len(i) < Max_len else i[:Max_len]
    else:
        return 'NIL'
Example #22
def getTagAndCateByLink(link,k):
    try:
        # from link get blogApp
        p1 ='.com/[\s\S]*/p/'
        a = re.search(p1,link)
        blogApp=a.group()[5:-3]

        # from link get postId
        p2 = '/p/[\s\S]*.html'
        b = re.search(p2, link)
        postId = b.group()[3:-5]

        html = requests.get(link).text
        soup = sp(html, 'lxml')
        l = soup.findAll('script')
        p3 = 'cb_blogId=\d*,'
        for i in l:
            s = str(i)
            c = re.search(p3, s)
            if c is not None:
                blogId = c.group()[10:-1]

        findTagParam['blogApp'] = blogApp
        findTagParam['blogId'] = blogId
        findTagParam['postId'] = postId

        json_ = requests.get(findTagUrl, params=findTagParam).json()
        p4 = '>(\w*|\W*|[\u4e00-\u9fa5])+</a>'
        text = json_[k]
        t2 = text.split(',')
        names = ''
        for t in t2:
            a = re.search(p4, t)
            b = a.group()[1:-4]
            names = names + b + ','
        # Use a separate accumulator instead of appending to k, then strip the
        # trailing comma from the joined tag/category names.
        return names[:-1]
    except Exception:
        return ''
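
The blogApp and postId values are sliced out of a cnblogs post URL with the regexes p1 and p2. A quick offline check of that extraction; the URL is a made-up example of the cnblogs pattern:

import re

link = "https://www.cnblogs.com/someblog/p/1234567.html"  # illustrative URL
blogApp = re.search('.com/[\s\S]*/p/', link).group()[5:-3]
postId = re.search('/p/[\s\S]*.html', link).group()[3:-5]
print(blogApp, postId)  # someblog 1234567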
Example #23
def get_detail(word):
    value_dict = {}
    url = url_head + word
    req = Request(url, headers=header)
    html = urllib2.urlopen(req).read()
    soup = sp(html, "html.parser")
    list_same = re.findall(pattern_name, html)
    if list_same:
        value_dict["同义词"] = list_same[0]
    list_title = soup.find_all("dt", class_="basicInfo-item name")
    list_value = soup.find_all("dd", class_="basicInfo-item value")
    for index, item in enumerate(list_title):
        title = re.findall(pattern_title, str(item))[0]
        value_list = re.findall(pattern_value, str(list_value[index]))
        if value_list and value_list[0].find("target") <= 0:
            value_dict[title] = value_list[0]
            continue
        elif value_list and value_list[0].find("target") > 0:
            value = re.findall(p_2, str(value_list[0]))
            if value:
                value_dict[title] = value[0]
                continue

        value_list = re.findall(p_2, str(list_value[index]))
        if value_list:
            value = ""
            for item in value_list:
                value += item + ";"
            value = value.strip(";")
            value_dict[title] = value
            continue

        value_list = re.findall(p_3, str(list_value[index]))
        if value_list:
            value_dict[title] = value_list[0]
            continue

    if 0:
        for key, value in value_dict.items():
            print key, value
    return word + "\001" + json.dumps(value_dict)
Example #24
def GetImage(img_url, rawurl):
    root = tk.Tk()
    root.title("FASTAF Music Downloader")
    root.iconbitmap('ytb.ico')
    response = requests.get(img_url)
    img_data = response.content
    image1 = Image.open(BytesIO(img_data))
    image2 = image1.resize((400, 230), Image.ANTIALIAS)
    image3 = ImageTk.PhotoImage(image2)
    canvas = tk.Canvas(root)
    canvas.pack()
    soup = sp(urlopen(rawurl), "lxml")
    Title = soup.title.string
    STitle = Title.replace("- YouTube", "")
    text1 = "Downloaded " + STitle + " !"
    canvas.create_text(180, 250, fill="darkblue", font="Times 10 ", text=text1)
    canvas.create_image(0, 0, anchor=tk.NW, image=image3)
    canvas.update()
    # partial(down, rawurl) defers the download call until the button is
    # clicked; grid() would return None, so keep the Button object and use
    # pack() like the other widgets.
    BtnSubmit = tk.Button(root, text="Submit", command=partial(down, rawurl))
    BtnSubmit.pack()
    root.mainloop()
Example #25
 def sc2(self):
     pg = r(
         homePage.url3,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img src="https://www.cs.utah.edu/~deb/assets/images/media/logo_it.png" alt="India Today" class="img-fluid">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'view-content'
         }).findAll('div', {'class': 'catagory-listing'})
         for i in imgs1:
             imgs.append(
                 i.find('div', {
                     'class': 'pic'
                 }).find('img').get('src'))
             heads.append(i.find('div', {'class': 'detail'}).find('a').text)
             links.append(
                 i.find('div', {
                     'class': 'detail'
                 }).find('a').get('href'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #26
 def tc1(self):
     pg = r(
         techPage.url2,
         headers={
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
         })
     pg = uo(pg)
     pg_ht = pg.read()
     pg.close()
     soup = sp(pg_ht, 'html.parser')
     logourl = '<img src="https://akm-img-a-in.tosshub.com/indiatoday/../sites/all/themes/itg/logo.png?v=1.3" alt="India Today" class="img-fluid">'
     heads = []
     links = []
     imgs = []
     logos = []
     try:
         imgs1 = soup.find('div', {
             'class': 'view-content'
         }).findAll('div', {'class': 'catagory-listing'})
         for i in imgs1:
             imgs.append(
                 i.find('div', {
                     'class': 'pic'
                 }).find('img').get('src'))
             heads.append(i.find('div', {'class': 'detail'}).find('a').text)
             links.append(
                 i.find('div', {
                     'class': 'detail'
                 }).find('a').get('href'))
             logos.append(logourl)
         news = list(zip(imgs, heads, links, logos))
         return news
     except:
         news = []
         return news
Example #27
def scraper():
    # Link
    link = "https://www.olx.ua/poltava"
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(
        executable_path=os.environ.get("CHROMEDRIVER_PATH"),
        chrome_options=chrome_options)
    driver.get(link)

    jobs_list = []
    # Number of ads the user wants to parse
    jobs_num = 100
    # Counter of jobs that have already been parsed
    jobs_counter = 0
    #Page counter (we start from page 1)
    page_counter = 1
    price_range = [0, 999999999]
    from_price, to_price = price_range
    # Iterating through pages
    while link is not None:
        # Collect the job ad rows on the current page
        page_tree = sp(driver.page_source, 'html.parser')
        jobs = page_tree.find_all('tr', {'class': 'wrap'})

        # Iterating through the job ads
        for job in jobs:
            if jobs_counter == jobs_num:
                break

            # Title
            title = job.find('h3').text.strip()

            #Price parsing
            try:
                price = job.find('p', {'class': 'price'}).text.strip()
                price_int = float(''.join(x for x in price
                                          if x.isdigit() or x == '.'))
            except:
                price = 'Не указана.'

            if ((from_price or to_price) != 0):
                if price == 'Не указана.' or from_price > price_int or price_int > to_price:
                    continue

            # Link to the details
            job_link = job.find('a')["href"]
            driver.get(job_link)

            # Finding a button to click in order to unblock telephone number
            try:
                phone_btn = driver.find_element_by_class_name('spoiler')
            except:
                continue
            # Wait until the telephone number is revealed
            driver.execute_script("arguments[0].click();", phone_btn)
            wait = WebDriverWait(driver, 10)
            try:
                wait.until_not(
                    ec.text_to_be_present_in_element(
                        (By.CLASS_NAME, 'contactitem'), 'x'))
            except:
                continue
            # Parse job link page
            job_page = sp(driver.page_source, 'html.parser')
            user_since = job_page.find('div', {
                'class': 'quickcontact__user-since'
            }).text
            #Parse heading
            try:
                heading = job_page.select('td.middle > ul > li')[1].text
            except:
                heading = 'Недобавленная рубрика.'
            #Parse phone number
            try:
                phones = job_page.select('div.contactitem')[0].text
                print(phones)
            except:
                # If there is no phone number, skip this job
                continue
            #Parse username
            try:
                name = driver.find_element_by_class_name(
                    'quickcontact__user-name').text
            except NoSuchElementException:
                name = 'Имя не указано.'

            jobs_list.append({
                'title': title.strip(),
                'phone': phones.strip(),
                'name': name.strip(),
                'heading': heading.strip(),
                'user_since': user_since.strip(),
                'price': price,
                'link': job_link.strip(),
            })
            # If an ad with this phone number is not already in the database, add it
            try:
                JobAdds.objects.get(phone=phones.strip())

            except (JobAdds.MultipleObjectsReturned, JobAdds.DoesNotExist):
                JobAdds.objects.create(title=title.strip(),
                                       link=job_link.strip(),
                                       phone=phones.strip(),
                                       name=name.strip(),
                                       heading=heading.strip(),
                                       price=price,
                                       user_since=user_since.strip())
                jobs_counter += 1
        # Link to another page
        try:
            if jobs_counter == jobs_num:
                break
            page_counter += 1
            link = page_tree.find(
                'a', {'class': '{page:' + str(page_counter) + '}'})['href']
            driver.get(link)
            driver.implicitly_wait(0.3)
        except (NoSuchElementException, IndexError, InvalidArgumentException,
                TypeError):
            link = None

    driver.close()
Example #28
import pandas as pd
import requests
from bs4 import BeautifulSoup as sp



response = requests.get("https://www.flipkart.com/mobiles/pr?sid=tyy,4io&marketplace=FLIPKART")
print(response)
soup = sp(response.content, "html.parser")
Name_list = []
Price_list = []
Rating_list = []

for i in range(1,51):
    link = soup.find("a",text = "Next").get("href")
    home_page_url = "https://www.flipkart.com"
    next_page_link = home_page_url + link[:-1]+str(i)
    response2 = requests.get(next_page_link)
    
    soup2 = sp(response2.content, "html.parser")
    cards = soup2.find_all("div", attrs = {"class": "_1UoZlX"})
    
    for card in cards:
        name = card.find("div", attrs = {"class": "_3wU53n"})
        price = card.find("div",attrs= {"class":"_2rQ-NK"})
        rating = card.find("div",attrs = {"class":"hGSR34"})
        
        if name:
            name_text = name.text
        else:
            name_text = None
Example #29
import requests
from bs4 import BeautifulSoup as sp

url = "https://www.vulnerability-lab.com/list-of-bug-bounty-programs.php"  # our url for parsing

webpage = requests.get(url=url)  # make a GET request and keep the response
soup = sp(webpage.content, 'html.parser')  # build a soup object to parse the page content
tables = soup.find_all('table')  # collect the tables from the soup object
a_tags = tables[4].find_all('a')  # take the <a> tags from the fifth table
with open("bug_bounty-sites.txt", "w") as sites_list:  # open a file to write the url list

    for a in a_tags:
        href = a.get("href", "")
        if "mailto" in href:  # skip e-mail links
            continue
        elif "http" not in href:
            sites_list.write("http://" + href + "\n")
        else:
            sites_list.write(href + "\n")  # write each url on its own line
Example #30
def getReply(opener, url, *args):
    pg = sp(getPg(opener,url,*args),'html.parser')
    tds = pg.findAll('td',{'class':'t_msgfont'})
    replys = [d.text.strip() for d in tds]
    return replys
Example #31
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 20 13:52:30 2020

@author: konstnar
"""

from bs4 import BeautifulSoup as sp
import os, re

file = open('page.html', encoding='utf-8')
file = file.read()  # read the whole saved form, not just the first line

page = sp(file, 'lxml')
qstn = page.findAll(
    'div',
    class_=
    'freebirdFormviewerViewItemsItemItemTitle exportItemTitle freebirdCustomFont'
)
for q in qstn:
    print(re.sub("\s\*", "", q.text))

#   if the answer is in a radio button
#   wrong answers will not be retrieved (have to add manually)
ans = page.findAll(
    ['label', 'span'],
    class_=
    'docssharedWizToggleLabeledContainer freebirdFormviewerViewItemsRadioChoice freebirdLabeledControlDarkerDisabled isChecked freebirdFormviewerViewItemsRadioGraded freebirdFormviewerViewItemsRadioCorrect isDisabled'
)

#   if the answer is in a text box
Example #32
urllib2.install_opener(opener)
# first come by page :login
request2 = urllib2.Request(url,params)
response2 = opener.open(request2)
pp = response2.read()
# second come by page: qiandao
qiandao_rq = urllib2.Request(qiandaourl,urllib.urlencode(qd_info))
rsp3 = opener.open(qiandao_rq)
qdao_page = rsp3.read()

#   regular expression: acquire info
rx=re.compile(r'class="postbox">\r\n(.+?)</div')
qdresult = re.findall(rx,qdao_page)
#   print info
#print str(qdresult[0]).decode('utf-8').encode('gbk')
#print qdao_page
print qdresult[0].decode('utf-8')

##formhash = re.findall('name=\"formhash\" value=\"([0-9a-f]*)\" ',s)

uy = 'http://130.211.8.178/'

su1 = 'http://130.211.8.178/forum-19-1.html'#88-6
pg = sp(getPg(opener,su1),'html.parser')
ths = pg.findAll('th',{'class':'subject common'})
#rs = [d.span.a['href'] for d in ths]
for d in ths[0:-1:2]:
    print d.span.text
    h=uy + d.span.a['href']
    doReply(opener,h)
    sleep(20)
Example #33
def openFile():
    with open("corpus_berita.txt", encoding="utf8") as file:
        soup = sp(file, 'html.parser')
    doc_berita = soup.find_all("doc")
    return doc_berita
Example #34
import urllib2 as ur
from bs4 import BeautifulSoup as sp

#url = "http://gucky.uni-muenster.de/cgi-bin/rgbtab-en"
url = "http://cvsweb.xfree86.org/cvsweb/*checkout*/xc/programs/rgb/rgb.txt?rev=1.1"

page = ur.urlopen(url).read()
soup = sp(page)

colors = soup.body.string
import re

pattern = re.compile('(\d+ \d+ \d+)')
rgb_colors = pattern.findall(colors)
rgb_uniq = []
for color in rgb_colors:
  if color not in rgb_uniq: rgb_uniq.append(color)
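
The final loop removes duplicate colors while keeping their first-seen order. Assuming rgb_colors from the example above, the same order-preserving de-duplication can be written in one line with dict.fromkeys:

rgb_uniq = list(dict.fromkeys(rgb_colors))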