Python CrawlerUtils Examples

Programming Language: Python

Namespace/Package Name: crawler_utils

Class/Type: CrawlerUtils

Examples at hotexamples.com: 2

Python CrawlerUtils - 2 examples found. These are the top rated real world Python examples of crawler_utils.CrawlerUtils extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

clean_description(1)

findLatLon(1)

getAddress(1)

getLastUpdated(1)

getReceivedDate(1)

getStatuses(1)

getWard(1)

Example #1

Show file

File: milieu_spider.py Project: AzDarGee/web-crawler

def get_dev_ids(since_date):
    """Scrape the ottwatch.ca development-application listing.

    Requests ``http://ottwatch.ca/devapps?since=<since_date>`` and walks every
    table row found under ``div.row-fluid > div.span5``.

    Args:
        since_date: Value appended (via ``str``) to the ``since=`` query
            parameter of the listing URL.

    Returns:
        A dict keyed by the running row counter. Each value has the shape
        ``{'allInfo': {...}, 'devID': <string>}``; ``allInfo`` carries
        ``lat`` and ``lon`` only when the row's third cell contains a link
        that ``CU.findLatLon`` can be applied to.
    """
    dev_ids = {}
    url = 'http://ottwatch.ca/devapps?since=' + str(since_date)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The counter advances for every <tr>, including rows without <td> cells,
    # so the resulting keys are not guaranteed to be contiguous.
    counter = 0
    for row_fluid in soup.findAll('div', {'class': 'row-fluid'}):
        for span_5 in row_fluid.findAll('div', {'class': 'span5'}):
            for app_row in span_5.div.table.find_all('tr'):
                cells = app_row.find_all('td')
                if cells:
                    # Always create the entry for a data row. Previously it
                    # was created only when a map link was present, so the
                    # devID assignment below raised KeyError for rows
                    # without coordinates.
                    entry = {'allInfo': {}}
                    dev_ids[counter] = entry

                    # Lat/lon come from the href of the link in the third
                    # cell, when that cell and link exist.
                    if len(cells) > 2 and cells[2].a is not None:
                        lat_lon = CU.findLatLon(cells[2].a.get('href'))
                        entry['allInfo']['lat'] = lat_lon[0]
                        entry['allInfo']['lon'] = lat_lon[1]

                    # The application id is the text of the link nested in
                    # the first cell (<b><nobr><a>...).
                    entry['devID'] = cells[0].b.nobr.a.string
                counter += 1
    return dev_ids

Example #2

Show file

File: milieu_spider.py Project: AzDarGee/web-crawler

def get_each_dev_app_info(dev_ids):
    """Fetch every application's detail page and fill in its ``allInfo`` dict.

    For each entry of ``dev_ids`` the page
    ``http://ottwatch.ca/devapps/<devID>`` is requested and parsed. Values
    found under ``div.row-fluid > div.span6`` are written into
    ``dev_ids[key]['allInfo']`` in place; the whole structure is printed once
    all entries have been processed.

    Args:
        dev_ids: Mapping whose values contain a ``'devID'`` entry and an
            ``'allInfo'`` dict (the shape produced by ``get_dev_ids``).
            It is mutated in place.
    """
    for devId in dev_ids:
        info = dev_ids[devId]['allInfo']
        url = 'http://ottwatch.ca/devapps/' + str(dev_ids[devId]['devID'])
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')

        # Visit every row-fluid container and each span6 column inside it.
        for row_fluid in soup.findAll('div', {'class': 'row-fluid'}):
            for span_6 in row_fluid.findAll('div', {'class': 'span6'}):

                if span_6.p:
                    # Development application type
                    info['appType'] = span_6.p.contents[1].string

                    # Development application description
                    info['description'] = CU.clean_description(
                        span_6.p.contents[2].string)

                    # Application on City of Ottawa website
                    info['city_url'] = span_6.p.contents[3].a.get('href')

                # MAIN table: the odd-numbered children (1, 3, 5, 7, 9) are
                # read as ward, received date, last updated, address and
                # related documents respectively.
                if span_6.table:
                    table = span_6.table

                    ## WARDS ##
                    (info['wardNum'],
                     info['wardName'],
                     info['wardCouncillor']) = CU.getWard(
                        table.contents[1].text)

                    ## RECEIVED DATE ##
                    # NOTE(review): the key is misspelled ('receievedDate');
                    # it is kept as-is because consumers of this dict are not
                    # visible here -- confirm before renaming.
                    info['receievedDate'] = CU.getReceivedDate(
                        table.contents[3].text)

                    ## UPDATED DATE ##
                    info['lastUpdated'] = CU.getLastUpdated(
                        table.contents[5].text)

                    ## ADDRESSES ##
                    info['address'] = CU.getAddress(table.contents[7].text)

                    ## Related Documents ##
                    docs_cell = table.contents[9]
                    if docs_cell.table:
                        documents = {}
                        info['relatedDocuments'] = documents
                        doc_count = 0
                        for doc in docs_cell.table.findAll('td'):
                            if doc.a:
                                # A cell with a link starts a new document.
                                documents[doc_count] = {
                                    'name': doc.a.text,
                                    'link': doc.a.get('href'),
                                }
                                doc_count += 1
                            elif doc.nobr and doc_count > 0:
                                # A cell with only <nobr> holds the date of
                                # the most recently added document; it is
                                # ignored when no document precedes it.
                                documents[doc_count - 1]['date'] = \
                                    doc.nobr.text

                    ## Application Lifecycle Statuses ##
                    info['statuses'] = CU.getStatuses(
                        dev_ids[devId]['devID'])

    print(dev_ids)