Example #1
import re

import linkGrabber
import pandas as pd


def getprice(product):
    site = "mysmartprice"

    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' +
        site + '+' + product)
    gb = links.find(limit=30)
    print(gb[25])
    gb1 = str(gb)
    frame = pd.DataFrame()
    gb2 = re.search("http://www.mysmartprice.com/(.+?)(%|&)", gb1)
    #print(gb2.group(1))
    if gb2:
        found2 = gb2.group(1)
        print(found2)
        # findprice() is defined elsewhere in the original project
        frame = findprice(found2)

    else:
        print('Not found')
    return frame


#getprice()
def get_links(url):
    try:
        links = linkGrabber.Links(url)
        l = links.find()
    except Exception:
        # return an empty list on failure so callers can still iterate
        l = []
    return l
Example #3
import re

import linkGrabber
import pandas as pd


def findgitlink(url, identifier):
    links = linkGrabber.Links(url)
    print('url: ' + url)
    gb = links.find()

    for item in gb:
        print(item['href'])
        if item['href'].startswith(identifier):
            return item['href']


def getlink(product):
    #pd.set_option('display.width',1000)
    pd.options.display.max_colwidth = 200
    site = "amazon.in"
    #product ="NEXUS 5"
    start = "'href': '/url?q="
    end = "&sa"
    frame = pd.DataFrame()
    links = linkGrabber.Links(
        'https://www.google.com/search?newwindow=1&biw=1366&bih=659&q=' +
        site + '+' + product + '+product+reviews')
    gb = links.find(limit=30)
    print("~~~~~~~~~~~~print g2[25]~~~~~~~~~~~~~~~ ")
    print(gb[25])
    gb1 = str(gb)
    print("~~~~~~~~~~~print g2 group(1)~~~~~~~~~~~~~")
    print(str(gb1))

    # note: the search query targets amazon.in, but this pattern matches
    # amazon.com links
    gb2 = re.search("https://www.amazon.com/(.+?)(%|&)", gb1)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("LINK GRABBED IS: ")
    print(gb2)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("print g2 group(1)")
    print(gb2.group(1))
    if gb2:
        '''found = gb2.group(1)
		print(found[-10:])
		found1=found+'xyz'
		#time.sleep(4)
		gb3 = re.search('/(.+?)xyz',found1)
		found2=gb3.group(1)
		print(found2)
		time.sleep(500)
		frame =scrap(found2)'''
        found = gb2.group(1)
        print(found[-10:])
        # scrap() is defined elsewhere in the original project
        frame = scrap(found[-10:])
    else:
        print('Not found')
    return frame


# note: this redefines getlink(); in the original these were separate snippets
def getlink(product):
    site = "snapdeal"
    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' +
        site + '+' + product + '+reviews')
    gb = links.find(limit=30)
    gb1 = str(gb)
    print(gb1)
    frame = pd.DataFrame()
    # write the raw link dump to a file; print(f, gb1) only printed the
    # file object, so use print(..., file=f) instead
    with open("link2.txt", "w+") as f:
        print(gb1, file=f)
    gb2 = re.search("www." + site + ".com(.+?)&", gb1)

    if gb2:
        found = gb2.group(1)
        link = "https://www." + site + ".com" + found
        print(link)
        frame = scrap(link)
    else:
        print('not found')
    return frame


# note: redefines getlink() once more
def getlink(txt):
    site = "flipkart"
    product = txt
    start = "'href': '/url?q="
    end = "&sa"
    links = linkGrabber.Links(
        'https://www.google.co.in/search?newwindow=1&biw=1366&bih=659&q=' +
        site + '+' + product + '+product+details')
    gb = links.find(limit=100)
    print(gb[25])
    gb1 = str(gb)
    frame = pd.DataFrame()
    gb2 = re.search("http://www.flipkart.com(.+?)(%|&)", gb1)

    if gb2:
        found2 = gb2.group(1)
        print(found2)
        #time.sleep(500)
        frame = scrap(found2)

    else:
        print('Not found')
    return frame
Example #7
import re
import linkGrabber

links = linkGrabber.Links(
    'https://www.google.com/search?client=firefox-b-d&q=king')
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)
Example #8
import linkGrabber

links = linkGrabber.Links(r"http://google.com")
gb = links.find(limit=4, duplicates=False, pretty=True)
print(gb)
Example #9
import re
import linkGrabber

links = linkGrabber.Links("https://devx.work")
#gb = links.find(limit=10, duplicates= False, pretty=True)
gb = links.find(pretty=True)

print(gb)
Example #10
import re
import linkGrabber

links = linkGrabber.Links("https://www.google.com/")
gb = links.find(limit=5, duplicates=False, pretty=True)
print(gb)
Example #11
import re
import linkGrabber
import pprint

links = linkGrabber.Links('http://www.xkcd.com/')
gb = links.find(limit=4, duplicates=False, pretty=True)
pprint.pprint(gb)

# I think I need some regular expressions under here, but I'm still trying to
# learn them... this works to get the URLs, though.
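
# For the regular-expression filtering mentioned above, linkGrabber accepts
# Beautiful Soup filters as keyword arguments, e.g. href with a compiled
# pattern (see also Example #12). A minimal sketch; the 'archive' pattern is
# an illustrative assumption, not part of the original snippet:
archive_links = links.find(href=re.compile('archive'),
                           duplicates=False, pretty=True)
pprint.pprint(archive_links)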
Example #12
            "www.google.com",
            "www.airbnb.com",
            # "www.hotels.com",
            # "www.amazon.com",
            "www.cars.com",
            "www.twitch.tv",
            "store.steampowered.com",
            "www.reuters.com",
            # "imgur.com",
            # "www.lowes.com",
            "www.cbssports.com",
            "www.nfl.com",
            # "www.expedia.com",
            "www.walmart.com",
            "www.wayfair.com",
            "bing.com",
            # "reddit.com",
        ]
        for u in initial_domains:
            print(u)
            links = linkGrabber.Links(f"https://{u}/")
            l = links.find(limit=100,
                           href=re.compile("//" + u),
                           duplicates=False)
            _c = lambda x: x if x.startswith("http") else "https:" + x
            paths.extend([_c(e["href"]) for e in l])
    d = DomDataset(DATA_DIR + "/dom-dataset", paths)
    if download:
        d.download()
    d.process()
Example #13
import re
import linkGrabber
import ssl

# TLS context created here but never actually used below
gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
#url = "http://mhrd.gov.in"
url = "http://upfireservice.gov.in/"
links = linkGrabber.Links(url)
gb = links.find(duplicates=False)
count = 0
for rec in gb:
    url_text = rec['text']
    url_href = rec['href']
    if '.pdf' in url_href:
        count += 1
        if not url_href.startswith('http'):
            url_href = url + url_href

        #print(url_text)
        print(url_href)
print(count)
Example #14
import linkGrabber

links = linkGrabber.Links(
    'http://allrecipes.com/recipes/276/desserts/cakes/?page=3#2')
# a bare dict after keyword arguments is a syntax error; unpack the
# Beautiful Soup filter with ** instead
gb = links.find(limit=10,
                duplicates=False,
                pretty=True,
                **{'class': 'pinterest'})
print(gb)
Example #15
    g = Github()
    repo = g.get_repo("mdadams/jasper")

    commit = repo.get_commit(sha=git_link.rsplit('/', 1)[-1])
    parent_sha = commit.commit.parents[0].sha
    print("sha : " + parent_sha)

# double the backslashes so "\L" is not treated as an escape sequence
directory = "C:\\LabWork\\" + parent_sha

git_issue_download_link = ''

if git_issue_link is None or git_issue_link == '':
    print("The git issue link was not found; can't reproduce the exploit")

else:
    links = linkGrabber.Links(git_issue_link)
    gb = links.find()

    for item in gb:
        if item['href'].endswith('.zip'):
            git_issue_download_link = item['href']

if not os.path.exists(directory):
    os.makedirs(directory)

if len(os.listdir(directory)) == 0:
    repo = Repo.clone_from(url="https://github.com/mdadams/jasper.git",
                           to_path="C:\LabWork" + "\\" + parent_sha)
    print("Repository cloned")
    print("Checking out to the wanted commit")
    repo.git.checkout(parent_sha)
Example #16
import re
import linkGrabber

#                                                        Explanation
# In this example we grab all of the URLs/links inside a page. This is useful
# in web-surfing and crawling scenarios, and it does not require Selenium.
# We use the linkGrabber library, which is not installed with Python by
# default, so install it first by opening your cmd and running:
#----------------------------------------------------------------------------------------------------------------------------------------------
#
#                                                    pip install linkGrabber
#
#----------------------------------------------------------------------------------------------------------------------------------------------
# Parameters of find():
# * filters (dict): Beautiful Soup's filters as a dictionary
# * limit (int): limit the number of links, in sequential order
# * reverse (bool): reverses how the list of <a> tags is sorted
# * sort (function): accepts a function that decides which key to sort upon
#   within the List class
# (a second call after the basic example below sketches reverse and an href
# filter)

links = linkGrabber.Links('https://google.com')
gb = links.find(limit=8, duplicates=False, pretty=True)

print(gb)
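
# A second call sketching the remaining documented parameters: reverse plus an
# href filter (as used in Example #12). The 'maps' pattern is an illustrative
# assumption, not part of the original example:
filtered = links.find(limit=8,
                      reverse=True,
                      href=re.compile('maps'),
                      duplicates=False,
                      pretty=True)
print(filtered)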
Example #17
import re
import linkGrabber

links = linkGrabber.Links("https://www.pitchvision.com/#/")
gb = links.find(pretty=True)

print(gb)

Example #18
# A Python script to read all the links from a webpage and print them.
# (the original imported the Python 2-only urllib2, which was unused and
# breaks on Python 3, so it is dropped here)

import linkGrabber

links = linkGrabber.Links(
    "https://www.indeed.com/jobs?q=devops+engineer&sort=date")
# the keyword is lowercase "limit"; "Limit" would be passed through as an
# attribute filter and match nothing
gb = links.find(limit=20, pretty=True)

print(gb)