Example 1
def get_emails(website, max_depth):
    """Returns a list of emails found at this website  
  
max_depth is how deep to follow links  
"""
    D = download.Download()
    return D.get_emails(website, max_depth=max_depth)
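
A hedged usage sketch: the download module above comes from the webscraping package used throughout these examples, and the target URL and depth are illustrative.

from webscraping import download

emails = get_emails('http://example.com/', max_depth=1)
print(emails)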
Example 2
def EmailScrape(email):
    D = download.Download()
    emails = D.get_emails("{}".format(email),
                          max_depth=1,
                          max_urls=None,
                          max_emails=25)
    return emails
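
Despite its name, the email parameter is passed straight to get_emails as the start URL, so a hedged call passes a site address; the limits above cap the crawl at one level deep, unlimited URLs and 25 addresses.

from webscraping import download

found = EmailScrape('http://example.com/')  # hypothetical target site
print(found)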
Example 3
def download_locations():
    """Download (place, zip code, latitude, longitude) rows for each country
    from the GeoNames postal-code dump.
    """
    D = download.Download(num_retries=1)
    index_url = 'http://download.geonames.org/export/zip/'
    index_html = D.get(index_url)
    for link in xpath.search(index_html, '//pre/a/@href'):
        if link.endswith(
                '.zip') and '_full' not in link and 'allCountries' not in link:
            download_html = D.get(urlparse.urljoin(index_url, link))
            input_zip = StringIO.StringIO()
            input_zip.write(download_html)
            try:
                tsv_data = zipfile.ZipFile(input_zip).read(
                    link.replace('.zip', '.txt'))
            except zipfile.BadZipfile as e:
                print e
                del D.cache[urlparse.urljoin(index_url, link)]
                continue
            output_filename = link.replace('.zip', '_locations.csv')
            writer = csv.writer(open(output_filename, 'w'))
            found = set()
            for row in csv.reader(tsv_data.splitlines(), delimiter='\t'):
                zip_code = row[1] = row[1].split('-')[0]
                try:
                    lat, lng = float(row[9]), float(row[10])
                except ValueError:
                    print 'bad coord:', row[9], row[10]
                else:
                    if lat and lng and zip_code not in found:
                        found.add(zip_code)
                        place = row[2]
                        writer.writerow([place, zip_code, lat, lng])
            print 'Downloaded to', output_filename
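
Judging from the calls above, this Python 2 snippet relies on roughly the following imports; a minimal driver might be:

import csv
import zipfile
import StringIO
import urlparse
from webscraping import download, xpath

download_locations()  # writes one *_locations.csv per country archive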
Example 4
def politician_and_org(politician_name):
    wiki_url = 'http://en.wikipedia.org/wiki/%s' % '_'.join(politician_name.split())
    html = download.Download().fetch(wiki_url)
    html = common.remove_tags(BeautifulSoup(html, "lxml").text)
    html = " ".join(html.split()).lower()

    doc = nlp(html)
    return doc.ents
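
The function assumes a module-level nlp object (presumably a spaCy pipeline) alongside BeautifulSoup and the webscraping helpers; a hedged setup sketch, with the model name being an assumption:

import spacy
from bs4 import BeautifulSoup
from webscraping import common, download

nlp = spacy.load('en_core_web_sm')  # assumption: any English pipeline will do

ents = politician_and_org('Angela Merkel')  # name is illustrative
print([ent.text for ent in ents])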
Example 5
def extract(url):
    '''
    Function that extracts product info from websites
    listed in the csv page . It takes the url as an argument.
    '''
    try:
        url = url.encode('utf-8')

        D = download.Download()

        try:
            xpath_input_file = open(os.path.join(os.path.dirname(__file__),
                                                 'webpage_xpath.csv'), 'rb')
            # Joining absolute path so that the function can be anywhere
        except IOError:
            # Checking for IO exceptions, i.e. whether the file exists or not
            print("An error occurred while reading the CSV file, "
                  "check your directory again")
            sys.exit()

        reader = csv.reader(xpath_input_file)
        row = list(reader)
        item_info = {}
        for r in range(0, len(row)):
            if url.find(row[r][0]) >= 0 and url.find(row[r][4]) >= 0:
                # Checks whether the given url is correct or not;
                # fails when the url is for the given site but not a product url
                xpath1 = row[r][1]
                xpath2 = row[r][2]
                xpath3 = row[r][3]

                html = D.get(url)
                # Webpage downloads after validation
                item_info['name'] = xpath.get(html, '%s//text()' % xpath1).strip()
                item_info['price'] = xpath.get(html, '%s//text()' % xpath2)
                item_info['image'] = xpath.get(html, '%s' % xpath3).strip()
                return item_info

            else:
                continue

        if item_info == {}:
            raise InvalidurlError("Enter a valid product url")

        xpath_input_file.close()
        sys.exit()

    except KeyboardInterrupt:
        raise
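
From the row indexing above, each line of webpage_xpath.csv appears to hold a site marker, three XPaths (name, price, image) and a substring that identifies product URLs; the snippet also assumes os, sys, csv, the webscraping download and xpath modules, and an InvalidurlError exception defined elsewhere. A hypothetical row and call:

# hypothetical webpage_xpath.csv row (columns row[r][0] .. row[r][4]):
#   example.com,//h1[@id="title"],//span[@id="price"],//img[@id="main"]/@src,/product/

item = extract('http://example.com/product/12345')
print(item)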
Example 6
def __init__(self, name):
    self.name = name
    self.normalizedName = self.name.lower().replace(' ', '-')
    self.url = '%s/%s/bushalte-%s' % (baseURL, place, self.normalizedName)
    self.downloader = download.Download()
    jsonFile = open(jsonName % self.normalizedName, 'r+')
    jsonString = json.load(jsonFile)
    jsonFile.close()
    self.id = jsonString['busstop-id']
    self.lines = dict()
    for line in jsonString['lines']:
        self.lines[str(line['number']) + line['destination']] = BusLine(
            self, line)
    self.update()
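
The constructor leans on several module-level names that are not shown in this snippet; the placeholders below are assumptions, kept only to make the dependencies explicit:

baseURL = 'http://example.org'  # assumption: site hosting the bus-stop pages
place = 'utrecht'               # assumption: city segment of the URL
jsonName = 'halte-%s.json'      # assumption: pattern filled with the normalized stop name
# BusLine and self.update() are likewise defined elsewhere in the original class.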
Example 7
def scrape_title(num, typ):
    f = open('title_%s.txt' % typ, 'w')
    D = download.Download(read_cache=False)

    key = MAN if typ == 'MAN' else QASH
    url = 'https://etherscan.io/token/%s' % key

    html = D.get(url)
    ts = common.regex_get(html, r'Total\sSupply\:[^<]*</td>[^<]*<td>([^<]+)<')
    vt = common.regex_get(html,
                          r'Value\sper\sToken\:[^<]*</td>[^<]*<td>([^<]+)<')
    th = common.regex_get(html, r'Token\sHolders\:[^<]*</td>[^<]*<td>([^<]+)<')
    f.write('Total Supply: %s\n' % ts)
    f.write('Value per Token: %s\n' % vt)
    f.write('Token Holders: %s\n' % th)
    f.write('No.Of.Transfers: %s\n' % num)
    f.close()
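
MAN and QASH are assumed to be module-level Etherscan token contract addresses; a hedged call (the transfer count is illustrative):

from webscraping import common, download

scrape_title(12345, 'MAN')  # writes Total Supply, Value per Token and Token Holders to title_MAN.txt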
Example 8
def scrapeBB(gamename):
    BB = download.Download(user_agent=None)

    search = gamename
    search = search.replace(" ", "+")

    html = BB.fetch(
        "http://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&id=pcat17071&type=page&ks=960&st={}&sc=Global&cp=1&sp=&qp=category_facet%3DVideo+Games~abcat0700000&list=y&usc=All+Categories&nrp=15&iht=n&seeAll="
        .format(search))
    if not html:
        nobb = 1
        print("Couldn't connect to Best Buy's servers.")
        return nobb

    gametitle = xpath.search(html, '//h3[@itemprop="name"]//a')
    productlinks = xpath.search(html, '//h3[@itemprop="name"]//a/@href')
    gameprice = xpath.search(html, '//span[@itemprop="price"]')

    return (gametitle, productlinks, gameprice)
Example 9
def scrapeGamestop(gamename):
    GS = download.Download()

    search = gamename
    search = search.replace(" ", "+")

    html = GS.fetch(
        "http://www.gamestop.com/browse?nav=16k-3-{},28zu0".format(search))
    if not html:
        nogs = 1
        print("Couldn't connect to Gamestop's servers.")
        return nogs

    gametitle = xpath.search(html,
                             '//div[@class="product_info grid_12"]//a[1]')
    productlinks = xpath.search(
        html, '//div[@class="product_info grid_12"]//a[1]/@href')
    gameprice = xpath.search(html, '//p[@class="pricing"]')

    return (gametitle, productlinks, gameprice)
Example 10
def scrapeAmazon(gamename):
    AMA = download.Download(user_agent=None)

    search = gamename
    search = search.replace(" ", "+")

    html = AMA.fetch(
        "http://www.amazon.com/gp/search/ref=sr_il_ti_videogames?rh=n%3A468642%2Ck%3A{}&keywords={}&ie=UTF8&qid=1407988315&lo=videogames"
        .format(search, search))
    if not html:
        noamazon = 1
        print("Couldn't connect to Amazon's servers.")
        return noamazon

    gametitle = xpath.search(
        html, '//div[@class="ilt3"]//a//span[@class="lrg bold"]')
    productlinks = xpath.search(html, '//div[@class="ilt3"]//a/@href')
    gameprice = xpath.search(html,
                             '//div[@class="ill3"]//span[@class="red bld"]')

    return (gametitle, productlinks, gameprice)
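
All three store scrapers return a (titles, links, prices) triple on success and 1 when the page could not be fetched; a hedged driver over the functions above:

from webscraping import download, xpath

for scraper in (scrapeBB, scrapeGamestop, scrapeAmazon):
    result = scraper('Mass Effect')  # game title is illustrative
    if result == 1:                  # connection-failure path above
        continue
    titles, links, prices = result
    print(titles[:3])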
Example 11
def incr_database(conn):
    # csi
    D = download.Download(delay=0, read_cache=None, write_cache=None)
    data = []
    csi = []
    src = 'http://www.csindex.com.cn/zh-CN/indices/index-detail/'
    for i in open('stocks.csv'):
        code = i.split('\t')[0]
        if 'CSI' in i or '000985' in i:
            url = src + code
            html = D.get(url)
            trddate = common.regex_get(html, r'截止日期:([^<]+)<')
            if trddate:
                trddate = trddate.replace('-', '')
            m = xpath.search(html,
                             r'//table[@class="table\stc"]/tr/td',
                             remove=None)
            close = m[0] if m else None
            change = m[1] if m and len(m) > 1 else None
            sql = ''' 
                     REPLACE INTO quote_csi(code, close, date, chg) VALUES('%s',%s,%s,%s);
            ''' % (code, close, trddate, change)
            conn.execute(sql)
        else:
            today = datetime.today().strftime('%Y-%m-%d')
            engine = create_engine(
                'mysql://*****:*****@localhost:3306/dige', echo=False)
            try:
                df = ts.get_k_data(code,
                                   ktype='D',
                                   index=True,
                                   start=today,
                                   end=today)
                if not df.empty:
                    sql = ''' delete from quote_nocsi where code like '%%%s%%' and date = '%s' ''' % (
                        code, today)
                    conn.execute(sql)
                    df.to_sql('quote_nocsi', engine, if_exists='append')
            except Exception as e:
                print e
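
The function appears to assume a tushare/SQLAlchemy stack, a tab-separated stocks.csv listing of index codes, and an open database connection for the REPLACE/DELETE statements; a hedged harness (credentials are placeholders, as in the masked engine URL above):

from datetime import datetime
import tushare as ts
from sqlalchemy import create_engine
from webscraping import common, download, xpath

engine = create_engine('mysql://user:password@localhost:3306/dige')  # placeholder credentials
incr_database(engine.connect())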
Example 12
from webscraping import download

D = download.Download()

emails = D.get_emails("http://buklijas.info/",
                      max_depth=2,
                      max_urls=None,
                      max_emails=None)

print(emails)
Example 13
from webscraping import download, alg

Dobj = download.Download()
html = Dobj.get("http://www.sharing55tories.blogspot.com/",
                max_depth=1000, max_urls=None, max_emails=None)

emails = alg.extract_emails(html)

print emails
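
For comparison with Example 12, this variant downloads a single page and pulls addresses out of it with alg.extract_emails; the extra keyword arguments passed to get() look like leftovers from a get_emails call. A hedged variant without them:

html = Dobj.get("http://www.sharing55tories.blogspot.com/")
emails = alg.extract_emails(html)
print emails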
Example 14
import os
import sys
from optparse import OptionParser
import re
import datetime
import webbrowser
from webscraping import common, download, webkit, xpath

DELAY = 5  # delay between downloads
IMAGE_DIR = 'images'  # directory to store screenshots
D = download.Download(delay=DELAY, num_retries=1)


def historical_screenshots(website, days):
    """Download screenshots for website since archive.org started crawling

    website:
        the website to generate screenshots for
    days:
        the number of days difference between archived pages

    Returns a list of the downloaded screenshots
    """
    # the earliest archived time
    t0 = get_earliest_crawl(website)
    print 'Earliest version:', t0
    # the current time
    t1 = datetime.datetime.now()
    delta = datetime.timedelta(days=days)
    wb = webkit.WebkitBrowser(gui=True, enable_plugins=True, load_images=True)
Example 15
def downloadPage(self, url):
    D = download.Download(read_cache=False, write_cache=False)
    return D.get(url)