Example #1
0
    def process_row(self, row, task):
        """Convert one PA drilling-permit source row into scraped items.

        The row is loaded into a ``PA_DrillingPermit`` item.  If ``self.db``
        already holds an item with the same ``Complete_API_`` and
        ``Date_Disposed`` only the ``_existing_count`` stat is incremented.
        Otherwise ``_new_count`` is incremented, the permit item is yielded
        and a ``FeedEntry`` is built for it; the feed entry and its tags are
        yielded only when the entry has both ``lat`` and ``lng``.

        Rows whose loaded item lacks ``Complete_API_`` or ``Date_Disposed``
        are skipped without yielding anything.

        :param row: mapping from source column name to value for one record.
        :param task: mapping providing ``target_url`` and ``about_url``.
        :returns: generator yielding the permit item, then the feed entry and
            the results of ``self.create_tag``.
        """
        loader = ItemLoader(PA_DrillingPermit())
        # ``<field>_in`` is the Scrapy ItemLoader hook for a per-field input
        # processor; Well_Type values are clipped to 20 characters.
        loader.Well_Type_in = lambda values: [value[:20] for value in values]

        # County / municipality were previously read from COUNTY_NAME and
        # MUNICIPALITY_NAME.
        loader.add_value('County_Name', row['COUNTY'])
        loader.add_value('Municipality_Name', row['MUNICIPALITY'])
        loader.add_value('Auth_Id', row['AUTHORIZATION_ID'])
        loader.add_value('Date_Disposed',
                         self.parse_date(row['PERMIT_ISSUED_DATE']))
        loader.add_value('Appl_Type_Code', row['APPLICATION_TYPE'])
        loader.add_value('Auth_Type_Description', row['AUTH_TYPE_DESCRIPTION'])
        loader.add_value('Complete_API_', row['WELL_API'])
        loader.add_value('Other_Id', self.base_api(row['WELL_API']))

        # The Y/N horizontal flag is derived from CONFIGURATION (it used to be
        # read from HORIZONTAL_WELL_IND).  Anything that is neither
        # horizontal/deviated nor vertical is logged and treated as 'N'.
        configuration = row['CONFIGURATION']
        if configuration in ("Horizontal Well", "Deviated Well"):
            horizontal = 'Y'
        else:
            horizontal = 'N'
            if configuration != "Vertical Well":
                self.log("Unknown PA Configuration: {0}."
                         .format(configuration), log.INFO)
        loader.add_value('Horizontal_Well', horizontal)

        # Remaining fields are copied straight from their source column.
        # Facility_Id was previously read from PRIMARY_FACILITY_ID.
        direct_columns = (
            ('Well_Type', 'WELL_TYPE'),
            ('Site_Name', 'FARM_NAME'),
            ('Latitude_Decimal', 'LATITUDE_DECIMAL'),
            ('Longitude_Decimal', 'LONGITUDE_DECIMAL'),
            ('Client_Id', 'CLIENT_ID'),
            ('Operator', 'OPERATOR'),
            ('Address1', 'OPERATOR_ADDRESS'),
            ('City', 'CITY'),
            ('State_Code', 'STATE'),
            ('Zip_Code', 'ZIP_CODE'),
            ('Unconventional', 'UNCONVENTIONAL'),
            ('OGO_Num', 'OGO_NUM'),
            ('Facility_Id', 'PRMRY_FAC_ID'),
        )
        for field_name, column in direct_columns:
            loader.add_value(field_name, row[column])

        item = loader.load_item()

        # Use .get(): subscripting a Scrapy Item raises KeyError for a field
        # the loader never populated (e.g. an empty API or unparsable date),
        # which would abort the row instead of skipping it.
        api = item.get('Complete_API_')
        disposed = item.get('Date_Disposed')
        if not (api and disposed):
            return

        stats = self.crawler.stats
        existing_item = self.db.loadItem(
            item, {'Complete_API_': api, 'Date_Disposed': disposed})
        if existing_item:
            # Change detection against the stored copy (contentDiff) is
            # currently disabled; existing permits are only counted.
            stats.inc_value('_existing_count', spider=self)
            return

        stats.inc_value('_new_count', spider=self)
        yield item

        # Template parameters: every item field rendered as escaped text,
        # with the two code fields replaced by the spider's lookups.
        params = dict(item)
        for field_name in item.fields:
            params[field_name] = escape("%s" % params.get(field_name, ''))
        params['Appl_Type_Code'] = self.get_appl_type(item)
        params['Well_Type'] = self.get_well_type(item)

        # Build the feed entry announcing the new permit.
        feed_loader = ItemLoader(FeedEntry())
        url = "%s/%s/%s" % (task['target_url'], item['Complete_API_'],
                            item['Date_Disposed'])
        feed_entry_id = self.db.uuid3_str(name=url.encode('ASCII'))
        feed_loader.add_value('id', feed_entry_id)
        feed_loader.add_value(
            'title',
            "PA %s Drilling Permit Issued in %s Township"
            % (params.get('Well_Type'), item.get('Municipality_Name')))
        feed_loader.add_value('incident_datetime', item.get('Date_Disposed'))
        feed_loader.add_value('link', task['about_url'])
        feed_loader.add_value('summary',
                              self.summary_template().substitute(params))
        feed_loader.add_value('content',
                              self.content_template().substitute(params))
        feed_loader.add_value('lat', item.get('Latitude_Decimal'))
        feed_loader.add_value('lng', item.get('Longitude_Decimal'))
        feed_loader.add_value('source_id', 4)
        feed_item = feed_loader.load_item()

        # Entries without both coordinates are not published.
        if not (feed_item.get('lat') and feed_item.get('lng')):
            return

        yield feed_item
        for tag in ('PADEP', 'frack', 'permit', 'drilling'):
            yield self.create_tag(feed_entry_id, tag)
        # NOTE(review): Marcellus_Shale_Well is not loaded by this method, so
        # this tag only appears if the field is populated elsewhere - confirm.
        if item.get('Marcellus_Shale_Well') == 'Y':
            yield self.create_tag(feed_entry_id, 'marcellus')
        well_type = params.get('Well_Type')
        if well_type:
            yield self.create_tag(feed_entry_id, well_type)
Example #2
0
    def process_row(self, row, task):
        """Convert one PA SPUD (drilling started) source row into scraped items.

        The row is loaded into a ``PA_Spud`` item.  If ``self.db`` already
        holds an item with the same ``Well_API__`` and ``SPUD_Date`` only the
        ``_existing_count`` stat is incremented.  Otherwise ``_new_count`` is
        incremented and the item is yielded.  Unless ``task`` has a truthy
        ``no_alert`` entry, a ``FeedEntry`` is then built; it and its tags are
        yielded only when the entry has both ``lat`` and ``lng``.

        Rows whose loaded item lacks ``Well_API__`` or ``SPUD_Date`` are
        skipped without yielding anything.

        :param row: mapping from source column name to value for one record.
        :param task: mapping providing ``target_url`` and ``about_url`` and,
            optionally, ``no_alert``.
        :returns: generator yielding the spud item, then the feed entry and
            the results of ``self.create_tag``.
        """
        def clip20(values):
            # Keep at most the first 20 characters of every value.
            return [value[:20] for value in values]

        loader = ItemLoader(PA_Spud())
        # ``<field>_in`` is the Scrapy ItemLoader hook for a per-field input
        # processor.
        loader.County_in = clip20
        loader.Municipality_in = clip20
        loader.Created_By_in = clip20
        loader.Modified_By_in = clip20
        loader.Well_Type_in = clip20

        # OGO__ was previously read from OPERATOR_OGO_NUM.
        loader.add_value('OGO__', row['OGO_NUM'])
        loader.add_value('SPUD_Date', self.parse_date(row['SPUD_DATE']))
        loader.add_value('County', row['COUNTY'])
        loader.add_value('Municipality', row['MUNICIPALITY'])
        loader.add_value('Operator_s_Name', row['OPERATOR'])
        loader.add_value('Farm_Name', row['FARM_NAME'])
        loader.add_value('Well_Number', '')  # Now included in FARM_NAME
        loader.add_value('Latitude', row['LATITUDE'])
        loader.add_value('Longitude', row['LONGITUDE'])

        # The Y/N horizontal flag is derived from CONFIGURATION (it used to be
        # read from HORIZONTAL_WELL_IND).  Anything that is neither
        # horizontal/deviated nor vertical is logged and treated as 'N'.
        configuration = row['CONFIGURATION']
        if configuration in ("Horizontal Well", "Deviated Well"):
            horizontal = 'Y'
        else:
            horizontal = 'N'
            if configuration != "Vertical Well":
                self.log("Unknown PA Configuration: {0}."
                         .format(configuration), log.INFO)
        loader.add_value('Horizontal_Ind_', horizontal)

        # Well_Type was previously read from WELL_TYPE, and the API number
        # from PERMIT_NUMBER.
        loader.add_value('Well_Type', row['WELL_CODE_DESC'])
        loader.add_value('Unconventional', row['UNCONVENTIONAL'])
        loader.add_value('Region', row['REGION'])
        loader.add_value('Well_API__', '37-%s-00-00' % row['API'])

        item = loader.load_item()

        # Use .get(): subscripting a Scrapy Item raises KeyError for a field
        # the loader never populated (e.g. an unparsable spud date), which
        # would abort the row instead of skipping it.
        api = item.get('Well_API__')
        spud_date = item.get('SPUD_Date')
        if not (api and spud_date):
            return

        stats = self.crawler.stats
        existing_item = self.db.loadItem(
            item, {'Well_API__': api, 'SPUD_Date': spud_date})
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
            return

        stats.inc_value('_new_count', spider=self)
        yield item

        # Template parameters: every item field rendered as escaped text.
        params = dict(item)
        for field_name in item.fields:
            params[field_name] = escape("%s" % params.get(field_name, ''))

        # Tasks flagged no_alert store the item but publish no feed entry.
        if task.get('no_alert'):
            return

        # Build the feed entry announcing the new spud report.
        feed_loader = ItemLoader(FeedEntry())
        url = "%s/%s/%s" % (task['target_url'], item['Well_API__'],
                            item['SPUD_Date'])
        feed_entry_id = self.db.uuid3_str(name=url.encode('ASCII'))
        feed_loader.add_value('id', feed_entry_id)
        feed_loader.add_value(
            'title',
            "%s Reports Drilling Started (SPUD) in %s Township"
            % (item.get('Operator_s_Name'), item.get('Municipality')))
        feed_loader.add_value('incident_datetime', item.get('SPUD_Date'))
        feed_loader.add_value('link', task['about_url'])
        feed_loader.add_value('summary',
                              self.summary_template().substitute(params))
        feed_loader.add_value('content',
                              self.content_template().substitute(params))
        feed_loader.add_value('lat', item.get('Latitude'))
        feed_loader.add_value('lng', item.get('Longitude'))
        feed_loader.add_value('source_id', 5)
        feed_item = feed_loader.load_item()

        # Entries without both coordinates are not published.
        if not (feed_item.get('lat') and feed_item.get('lng')):
            return

        yield feed_item
        for tag in ('PADEP', 'frack', 'spud', 'drilling'):
            yield self.create_tag(feed_entry_id, tag)
        well_type = item.get('Well_Type')
        if well_type:
            yield self.create_tag(feed_entry_id, well_type.lower())
        if item.get('Unconventional') == 'Yes':
            yield self.create_tag(feed_entry_id, 'unconventional')
Example #3
0
    def process_row(self, row, task):
        """Convert one PA SPUD (drilling started) source row into scraped items.

        The row is loaded into a ``PA_Spud`` item.  If ``self.db`` already
        holds an item with the same ``Well_API__`` and ``SPUD_Date`` only the
        ``_existing_count`` stat is incremented.  Otherwise ``_new_count`` is
        incremented and the item is yielded.  Unless ``task`` has a truthy
        ``no_alert`` entry, a ``FeedEntry`` is then built; it and its tags are
        yielded only when the entry has both ``lat`` and ``lng``.

        Rows whose loaded item lacks ``Well_API__`` or ``SPUD_Date`` are
        skipped without yielding anything.

        :param row: mapping from source column name to value for one record.
        :param task: mapping providing ``target_url`` and ``about_url`` and,
            optionally, ``no_alert``.
        :returns: generator yielding the spud item, then the feed entry and
            the results of ``self.create_tag``.
        """

        def clip20(values):
            # Keep at most the first 20 characters of every value.
            return [value[:20] for value in values]

        loader = ItemLoader(PA_Spud())
        # ``<field>_in`` is the Scrapy ItemLoader hook for a per-field input
        # processor.
        loader.County_in = clip20
        loader.Municipality_in = clip20
        loader.Created_By_in = clip20
        loader.Modified_By_in = clip20
        loader.Well_Type_in = clip20

        # OGO__ was previously read from OPERATOR_OGO_NUM.
        loader.add_value("OGO__", row["OGO_NUM"])
        loader.add_value("SPUD_Date", self.parse_date(row["SPUD_DATE"]))
        loader.add_value("County", row["COUNTY"])
        loader.add_value("Municipality", row["MUNICIPALITY"])
        loader.add_value("Operator_s_Name", row["OPERATOR"])
        loader.add_value("Farm_Name", row["FARM_NAME"])
        loader.add_value("Well_Number", "")  # Now included in FARM_NAME
        loader.add_value("Latitude", row["LATITUDE"])
        loader.add_value("Longitude", row["LONGITUDE"])

        # The Y/N horizontal flag is derived from CONFIGURATION (it used to be
        # read from HORIZONTAL_WELL_IND).  Anything that is neither
        # horizontal/deviated nor vertical is logged and treated as "N".
        configuration = row["CONFIGURATION"]
        if configuration in ("Horizontal Well", "Deviated Well"):
            horizontal = "Y"
        else:
            horizontal = "N"
            if configuration != "Vertical Well":
                self.log("Unknown PA Configuration: {0}.".format(configuration), log.INFO)
        loader.add_value("Horizontal_Ind_", horizontal)

        # Well_Type was previously read from WELL_TYPE, and the API number
        # from PERMIT_NUMBER.
        loader.add_value("Well_Type", row["WELL_CODE_DESC"])
        loader.add_value("Unconventional", row["UNCONVENTIONAL"])
        loader.add_value("Region", row["REGION"])
        loader.add_value("Well_API__", "37-%s-00-00" % row["API"])

        item = loader.load_item()

        # Use .get(): subscripting a Scrapy Item raises KeyError for a field
        # the loader never populated (e.g. an unparsable spud date), which
        # would abort the row instead of skipping it.
        api = item.get("Well_API__")
        spud_date = item.get("SPUD_Date")
        if not (api and spud_date):
            return

        stats = self.crawler.stats
        existing_item = self.db.loadItem(item, {"Well_API__": api, "SPUD_Date": spud_date})
        if existing_item:
            stats.inc_value("_existing_count", spider=self)
            return

        stats.inc_value("_new_count", spider=self)
        yield item

        # Template parameters: every item field rendered as escaped text.
        params = dict(item)
        for field_name in item.fields:
            params[field_name] = escape("%s" % params.get(field_name, ""))

        # Tasks flagged no_alert store the item but publish no feed entry.
        if task.get("no_alert"):
            return

        # Build the feed entry announcing the new spud report.
        feed_loader = ItemLoader(FeedEntry())
        url = "%s/%s/%s" % (task["target_url"], item["Well_API__"], item["SPUD_Date"])
        feed_entry_id = self.db.uuid3_str(name=url.encode("ASCII"))
        feed_loader.add_value("id", feed_entry_id)
        feed_loader.add_value(
            "title",
            "%s Reports Drilling Started (SPUD) in %s Township"
            % (item.get("Operator_s_Name"), item.get("Municipality")),
        )
        feed_loader.add_value("incident_datetime", item.get("SPUD_Date"))
        feed_loader.add_value("link", task["about_url"])
        feed_loader.add_value("summary", self.summary_template().substitute(params))
        feed_loader.add_value("content", self.content_template().substitute(params))
        feed_loader.add_value("lat", item.get("Latitude"))
        feed_loader.add_value("lng", item.get("Longitude"))
        feed_loader.add_value("source_id", 5)
        feed_item = feed_loader.load_item()

        # Entries without both coordinates are not published.
        if not (feed_item.get("lat") and feed_item.get("lng")):
            return

        yield feed_item
        for tag in ("PADEP", "frack", "spud", "drilling"):
            yield self.create_tag(feed_entry_id, tag)
        well_type = item.get("Well_Type")
        if well_type:
            yield self.create_tag(feed_entry_id, well_type.lower())
        if item.get("Unconventional") == "Yes":
            yield self.create_tag(feed_entry_id, "unconventional")
Example #4
0
    def process_row(self, row, task):
        """Convert one PA drilling-permit source row into scraped items.

        The row is loaded into a ``PA_DrillingPermit`` item.  If ``self.db``
        already holds an item with the same ``Complete_API_`` and
        ``Date_Disposed`` only the ``_existing_count`` stat is incremented.
        Otherwise ``_new_count`` is incremented, the permit item is yielded
        and a ``FeedEntry`` is built for it; the feed entry and its tags are
        yielded only when the entry has both ``lat`` and ``lng``.

        Rows whose loaded item lacks ``Complete_API_`` or ``Date_Disposed``
        are skipped without yielding anything.

        :param row: mapping from source column name to value for one record.
        :param task: mapping providing ``target_url`` and ``about_url``.
        :returns: generator yielding the permit item, then the feed entry and
            the results of ``self.create_tag``.
        """
        loader = ItemLoader(PA_DrillingPermit())
        # ``<field>_in`` is the Scrapy ItemLoader hook for a per-field input
        # processor; Well_Type values are clipped to 20 characters.
        loader.Well_Type_in = lambda values: [value[:20] for value in values]

        # County / municipality were previously read from COUNTY_NAME and
        # MUNICIPALITY_NAME.
        loader.add_value('County_Name', row['COUNTY'])
        loader.add_value('Municipality_Name', row['MUNICIPALITY'])
        loader.add_value('Auth_Id', row['AUTHORIZATION_ID'])
        loader.add_value('Date_Disposed',
                         self.parse_date(row['PERMIT_ISSUED_DATE']))
        loader.add_value('Appl_Type_Code', row['APPLICATION_TYPE'])
        loader.add_value('Auth_Type_Description', row['AUTH_TYPE_DESCRIPTION'])
        loader.add_value('Complete_API_', row['WELL_API'])
        loader.add_value('Other_Id', self.base_api(row['WELL_API']))

        # The Y/N horizontal flag is derived from CONFIGURATION (it used to be
        # read from HORIZONTAL_WELL_IND).  Anything that is neither
        # horizontal/deviated nor vertical is logged and treated as 'N'.
        configuration = row['CONFIGURATION']
        if configuration in ("Horizontal Well", "Deviated Well"):
            horizontal = 'Y'
        else:
            horizontal = 'N'
            if configuration != "Vertical Well":
                self.log(
                    "Unknown PA Configuration: {0}.".format(configuration),
                    log.INFO)
        loader.add_value('Horizontal_Well', horizontal)

        # Remaining fields are copied straight from their source column.
        # Facility_Id was previously read from PRIMARY_FACILITY_ID.
        direct_columns = (
            ('Well_Type', 'WELL_TYPE'),
            ('Site_Name', 'FARM_NAME'),
            ('Latitude_Decimal', 'LATITUDE_DECIMAL'),
            ('Longitude_Decimal', 'LONGITUDE_DECIMAL'),
            ('Client_Id', 'CLIENT_ID'),
            ('Operator', 'OPERATOR'),
            ('Address1', 'OPERATOR_ADDRESS'),
            ('City', 'CITY'),
            ('State_Code', 'STATE'),
            ('Zip_Code', 'ZIP_CODE'),
            ('Unconventional', 'UNCONVENTIONAL'),
            ('OGO_Num', 'OGO_NUM'),
            ('Facility_Id', 'PRMRY_FAC_ID'),
        )
        for field_name, column in direct_columns:
            loader.add_value(field_name, row[column])

        item = loader.load_item()

        # Use .get(): subscripting a Scrapy Item raises KeyError for a field
        # the loader never populated (e.g. an empty API or unparsable date),
        # which would abort the row instead of skipping it.
        api = item.get('Complete_API_')
        disposed = item.get('Date_Disposed')
        if not (api and disposed):
            return

        stats = self.crawler.stats
        existing_item = self.db.loadItem(
            item, {
                'Complete_API_': api,
                'Date_Disposed': disposed
            })
        if existing_item:
            # Change detection against the stored copy (contentDiff) is
            # currently disabled; existing permits are only counted.
            stats.inc_value('_existing_count', spider=self)
            return

        stats.inc_value('_new_count', spider=self)
        yield item

        # Template parameters: every item field rendered as escaped text,
        # with the two code fields replaced by the spider's lookups.
        params = dict(item)
        for field_name in item.fields:
            params[field_name] = escape("%s" % params.get(field_name, ''))
        params['Appl_Type_Code'] = self.get_appl_type(item)
        params['Well_Type'] = self.get_well_type(item)

        # Build the feed entry announcing the new permit.
        feed_loader = ItemLoader(FeedEntry())
        url = "%s/%s/%s" % (task['target_url'], item['Complete_API_'],
                            item['Date_Disposed'])
        feed_entry_id = self.db.uuid3_str(name=url.encode('ASCII'))
        feed_loader.add_value('id', feed_entry_id)
        feed_loader.add_value(
            'title', "PA %s Drilling Permit Issued in %s Township" %
            (params.get('Well_Type'), item.get('Municipality_Name')))
        feed_loader.add_value('incident_datetime', item.get('Date_Disposed'))
        feed_loader.add_value('link', task['about_url'])
        feed_loader.add_value('summary',
                              self.summary_template().substitute(params))
        feed_loader.add_value('content',
                              self.content_template().substitute(params))
        feed_loader.add_value('lat', item.get('Latitude_Decimal'))
        feed_loader.add_value('lng', item.get('Longitude_Decimal'))
        feed_loader.add_value('source_id', 4)
        feed_item = feed_loader.load_item()

        # Entries without both coordinates are not published.
        if not (feed_item.get('lat') and feed_item.get('lng')):
            return

        yield feed_item
        for tag in ('PADEP', 'frack', 'permit', 'drilling'):
            yield self.create_tag(feed_entry_id, tag)
        # NOTE(review): Marcellus_Shale_Well is not loaded by this method, so
        # this tag only appears if the field is populated elsewhere - confirm.
        if item.get('Marcellus_Shale_Well') == 'Y':
            yield self.create_tag(feed_entry_id, 'marcellus')
        well_type = params.get('Well_Type')
        if well_type:
            yield self.create_tag(feed_entry_id, well_type)
Example #5
0
    def process_row(self, row, task):
        """Convert one PA SPUD (drilling started) source row into scraped items.

        The row is loaded into a ``PA_Spud`` item.  If ``self.db`` already
        holds an item with the same ``Well_API__`` and ``SPUD_Date`` only the
        ``_existing_count`` stat is incremented.  Otherwise ``_new_count`` is
        incremented and the item is yielded.  Unless ``task`` has a truthy
        ``no_alert`` entry, a ``FeedEntry`` is then built; it and its tags are
        yielded only when the entry has both ``lat`` and ``lng``.

        Rows whose loaded item lacks ``Well_API__`` or ``SPUD_Date`` are
        skipped without yielding anything.

        :param row: mapping from source column name to value for one record.
        :param task: mapping providing ``target_url`` and ``about_url`` and,
            optionally, ``no_alert``.
        :returns: generator yielding the spud item, then the feed entry and
            the results of ``self.create_tag``.
        """
        def clip20(values):
            # Keep at most the first 20 characters of every value.
            return [value[:20] for value in values]

        loader = ItemLoader(PA_Spud())
        # ``<field>_in`` is the Scrapy ItemLoader hook for a per-field input
        # processor.
        loader.County_in = clip20
        loader.Municipality_in = clip20
        loader.Created_By_in = clip20
        loader.Modified_By_in = clip20
        loader.Well_Type_in = clip20

        # OGO__ was previously read from OPERATOR_OGO_NUM.
        loader.add_value('OGO__', row['OGO_NUM'])
        loader.add_value('SPUD_Date', self.parse_date(row['SPUD_DATE']))
        loader.add_value('County', row['COUNTY'])
        loader.add_value('Municipality', row['MUNICIPALITY'])
        loader.add_value('Operator_s_Name', row['OPERATOR'])
        loader.add_value('Farm_Name', row['FARM_NAME'])
        loader.add_value('Well_Number', '')  # Now included in FARM_NAME
        loader.add_value('Latitude', row['LATITUDE'])
        loader.add_value('Longitude', row['LONGITUDE'])

        # The Y/N horizontal flag is derived from CONFIGURATION (it used to be
        # read from HORIZONTAL_WELL_IND).  Anything that is neither
        # horizontal/deviated nor vertical is logged and treated as 'N'.
        configuration = row['CONFIGURATION']
        if configuration in ("Horizontal Well", "Deviated Well"):
            horizontal = 'Y'
        else:
            horizontal = 'N'
            if configuration != "Vertical Well":
                self.log(
                    "Unknown PA Configuration: {0}.".format(configuration),
                    log.INFO)
        loader.add_value('Horizontal_Ind_', horizontal)

        # Well_Type was previously read from WELL_TYPE, and the API number
        # from PERMIT_NUMBER.
        loader.add_value('Well_Type', row['WELL_CODE_DESC'])
        loader.add_value('Unconventional', row['UNCONVENTIONAL'])
        loader.add_value('Region', row['REGION'])
        loader.add_value('Well_API__', '37-%s-00-00' % row['API'])

        item = loader.load_item()

        # Use .get(): subscripting a Scrapy Item raises KeyError for a field
        # the loader never populated (e.g. an unparsable spud date), which
        # would abort the row instead of skipping it.
        api = item.get('Well_API__')
        spud_date = item.get('SPUD_Date')
        if not (api and spud_date):
            return

        stats = self.crawler.stats
        existing_item = self.db.loadItem(item, {
            'Well_API__': api,
            'SPUD_Date': spud_date
        })
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
            return

        stats.inc_value('_new_count', spider=self)
        yield item

        # Template parameters: every item field rendered as escaped text.
        params = dict(item)
        for field_name in item.fields:
            params[field_name] = escape("%s" % params.get(field_name, ''))

        # Tasks flagged no_alert store the item but publish no feed entry.
        if task.get('no_alert'):
            return

        # Build the feed entry announcing the new spud report.
        feed_loader = ItemLoader(FeedEntry())
        url = "%s/%s/%s" % (task['target_url'], item['Well_API__'],
                            item['SPUD_Date'])
        feed_entry_id = self.db.uuid3_str(name=url.encode('ASCII'))
        feed_loader.add_value('id', feed_entry_id)
        feed_loader.add_value(
            'title',
            "%s Reports Drilling Started (SPUD) in %s Township" %
            (item.get('Operator_s_Name'), item.get('Municipality')))
        feed_loader.add_value('incident_datetime', item.get('SPUD_Date'))
        feed_loader.add_value('link', task['about_url'])
        feed_loader.add_value('summary',
                              self.summary_template().substitute(params))
        feed_loader.add_value('content',
                              self.content_template().substitute(params))
        feed_loader.add_value('lat', item.get('Latitude'))
        feed_loader.add_value('lng', item.get('Longitude'))
        feed_loader.add_value('source_id', 5)
        feed_item = feed_loader.load_item()

        # Entries without both coordinates are not published.
        if not (feed_item.get('lat') and feed_item.get('lng')):
            return

        yield feed_item
        for tag in ('PADEP', 'frack', 'spud', 'drilling'):
            yield self.create_tag(feed_entry_id, tag)
        well_type = item.get('Well_Type')
        if well_type:
            yield self.create_tag(feed_entry_id, well_type.lower())
        if item.get('Unconventional') == 'Yes':
            yield self.create_tag(feed_entry_id, 'unconventional')