Example #1
import datetime

import scrapy
from scrapy.exceptions import CloseSpider

# Project-specific helpers assumed importable: load_tables, zone1, zone2,
# zone3a, score_name, list2string, clean_string, and the TrackItem item class.
class QuotesSpider(scrapy.Spider):
    name = "suspects"
    sesh, Suspect, Leaver = load_tables()
    lvr = sesh.query(Leaver).filter_by(status='Lost', updated='No').order_by(
        Leaver.timestamp).limit(5).all()
    slinks = sesh.query(Suspect).all()
    link_list = []
    for s in slinks:
        link_list.append(s.link)

    # Defaults bind the class-level session and query results at definition
    # time, so the method can use them without going through self.
    def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr):
        print('number of names to be scraped:', len(lvr))
        if len(lvr) > 0:
            for l in lvr:
                print('Leaver Selected: ', l.name)
                url = ('https://www.google.com/search?q=' + l.name + ' ' +
                       'site:www.linkedin.com/in/' + ' ' + 'language:en')
                l.timestamp = datetime.datetime.now(
                    datetime.timezone.utc).isoformat()
                sesh.commit()

                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={
                                         'lid': l.id,
                                         'name': l.name
                                     })
        else:
            raise CloseSpider('All Leavers Have Suspects')

    def parse(self, response):
        db_name = response.meta['name']
        for i in response.xpath("//div[contains(@class, 'g')]"):
            print('**** G CLASS ****', i)
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            print('Testing New Zone2: ', clink)
            if 'https://www.linkedin.com/in/' in clink:
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)
                slp_xtract = i.xpath(
                    ".//div[contains(@class, 'slp')]/descendant::text()"
                ).extract()
                print('Raw SLP Xtract: ', slp_xtract)
                print('LENGTH of SLP Xtract: ', len(slp_xtract))

                if len(slp_xtract) > 0:
                    txt = str(slp_xtract)
                    print('length of slp: ', len(txt))
                    print('slp class detected. Running Zone3a Analysis...')
                    city, role, firm = zone3a(txt)
                    print('results from zone3a analysis: ')
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['ident'] = response.meta['lid']
                    item['location'] = city
                    item['role'] = role if role1 is None else role1
                    item['firm'] = firm if firm1 is None else firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        item['status'] = 'Success'
                        yield item
                    else:
                        yield None

                else:
                    print('no slp class found.  salvaging text')
                    st_class = i.xpath(
                        ".//span[contains(@class, 'st')]/descendant::text()"
                    ).extract()
                    print('ST Text Extracted: ', st_class)
                    salvage_string = list2string(st_class)
                    print('st class converted to string: ', salvage_string)
                    cleaned_str = clean_string(salvage_string, name)
                    cleaned_str = cleaned_str.strip()
                    print('st string filtered: ', cleaned_str)
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['location'] = None
                    item['ident'] = response.meta['lid']
                    item['role'] = role1
                    if firm1 is None:
                        if len(cleaned_str) > 100:
                            print(
                                ">>Cleaned string too long for db. Reducing to: ",
                                cleaned_str[:99])
                            item['firm'] = cleaned_str[:99]
                        else:
                            item['firm'] = cleaned_str
                    else:
                        item['firm'] = firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        item['status'] = 'Success'
                        yield item
                    else:
                        yield None
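
These spiders gate what they yield on score_name(), which is defined
elsewhere in the project. A minimal sketch of what it might look like,
assuming it returns a 0-100 similarity score (approximated here with difflib
from the standard library):

import difflib

def score_name(scraped_name, db_name):
    # Hypothetical stand-in: 0-100 fuzzy similarity between two names.
    matcher = difflib.SequenceMatcher(
        None, scraped_name.lower().strip(), db_name.lower().strip())
    return matcher.ratio() * 100

# score_name('John A. Smith', 'John Smith') -> roughly 87, which would pass
# the spiders' score > 80 threshold.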
Example #2
import datetime

import scrapy
from scrapy.exceptions import CloseSpider

# Project-specific helpers assumed importable: load_tables, zone1, zone2,
# zone3a, score_name, list2string, clean_string, and the S3SuspectsItem item
# class.
class QuotesSpider(scrapy.Spider):
    name = "suspects"
    sesh, Suspect, Leaver = load_tables()
    in_lvr = sesh.query(Leaver).filter_by(
        result='Lost',
        inprosshell='Yes').order_by(Leaver.suspectcheck).limit(5).all()
    out_lvr = sesh.query(Leaver).filter_by(result='Lost').order_by(
        Leaver.suspectcheck).limit(5).all()
    slinks = sesh.query(Suspect).all()
    link_list = []
    for s in slinks:
        link_list.append(s.slink)

    # Prefer leavers flagged as in the prospect shell; fall back to the rest.
    lvr = in_lvr if in_lvr else out_lvr

    def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr):
        print('number of names to be scraped:', len(lvr))
        if len(lvr) > 0:
            for l in lvr:
                print('Leaver Selected: ', l.name)
                try:
                    # The first word of the old firm name narrows the search.
                    oldfirm = l.prosfirm.split()[0]
                    url = ('https://www.google.com/search?q=' + l.name + ' ' +
                           oldfirm + ' ' + 'site:www.linkedin.com/in/')
                except (AttributeError, IndexError):
                    # No usable firm on record; search by name alone.
                    url = ('https://www.google.com/search?q=' + l.name + ' ' +
                           'site:www.linkedin.com/in/')
                l.suspectcheck = datetime.datetime.now(
                    datetime.timezone.utc).isoformat()
                sesh.commit()

                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={
                                         'lid': l.id,
                                         'name': l.name
                                     })
        else:
            raise CloseSpider('All Leavers Have Suspects')

    def parse(self, response):
        db_name = response.meta['name']
        print('***')
        print('***')
        print('***')
        print('Parsing: ', db_name)
        for i in response.xpath("//div[@class='g']"):
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            if 'https://www.linkedin.com/in/' in clink:
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)
                name_test = score_name(name, db_name)
                if name_test > 65:
                    print('Passing Score: ', name_test)
                    slp_xtract = i.xpath(
                        ".//div[contains(@class, 'slp')]/descendant::text()"
                    ).extract()
                    print('Raw SLP Xtract: ', slp_xtract)
                    print('LENGTH of SLP Xtract: ', len(slp_xtract))

                    if len(slp_xtract) > 0:
                        txt = str(slp_xtract)
                        print('length of slp: ', len(txt))
                        print('slp class detected. Running Zone3a Analysis...')
                        city, role, firm = zone3a(txt)
                        print('results from zone3a analysis: ')
                        item = S3SuspectsItem()
                        item['name'] = name
                        item['link'] = clink
                        item['ident'] = response.meta['lid']
                        item['location'] = city
                        item['role'] = role if role1 is None else role1
                        item['firm'] = firm if firm1 is None else firm1

                        yield item

                    else:
                        print('no slp class found.  salvaging text')
                        st_class = i.xpath(
                            ".//span[contains(@class, 'st')]/descendant::text()"
                        ).extract()
                        print('ST Text Extracted: ', st_class)
                        salvage_string = list2string(st_class)
                        print('st class converted to string: ', salvage_string)
                        cleaned_str = clean_string(salvage_string, name)
                        cleaned_str = cleaned_str.strip()
                        print('st string filtered: ', cleaned_str)
                        item = S3SuspectsItem()
                        item['name'] = name
                        item['link'] = clink
                        item['location'] = None
                        item['ident'] = response.meta['lid']
                        item['role'] = role1
                        if firm1 is None:
                            if len(cleaned_str) > 100:
                                print(
                                    ">>Cleaned string too long for db. Reducing to: ",
                                    cleaned_str[:98])
                                item['firm'] = cleaned_str[:98]
                            else:
                                item['firm'] = cleaned_str
                        else:
                            item['firm'] = firm1

                        yield item
                else:
                    print('Failing Score: ', name_test)
                    yield None
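
The salvage branch above leans on list2string() and clean_string(), which
are also defined elsewhere. A minimal sketch under the assumption that the
first joins the XPath text fragments and the second strips the leaver's own
name out of the snippet:

import re

def list2string(fragments):
    # Hypothetical stand-in: join non-empty XPath text fragments with spaces.
    return ' '.join(f.strip() for f in fragments if f.strip())

def clean_string(text, name):
    # Hypothetical stand-in: drop the person's name and collapse whitespace,
    # leaving mostly role/firm text behind.
    return re.sub(r'\s+', ' ', text.replace(name, ''))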
Example #3
import scrapy

# Project-specific helpers assumed importable: load_tables, zone1, zone2,
# zone3a, score_name, list2string, clean_string, and the TrackItem item class.
class QuotesSpider(scrapy.Spider):
    name = "tracking"
    sesh, Suspect, Leaver = load_tables()
    fresh_lvr = sesh.query(Leaver).filter_by(
        status='Tracking', track_lst_update=None).limit(5).all()
    lvr = sesh.query(Leaver).filter_by(status='Tracking').order_by(
        Leaver.track_lst_update).limit(5).all()

    def start_requests(self,
                       sesh=sesh,
                       Leaver=Leaver,
                       lvr=lvr,
                       fresh_lvr=fresh_lvr):
        print('***** Number of Fresh Leavers Not Yet Tracked: ',
              len(fresh_lvr))
        # Track never-tracked leavers first; otherwise re-track the stalest.
        for l in (fresh_lvr if fresh_lvr else lvr):
            url = ('https://www.google.com/search?q=' + l.llink + ' ' +
                   'filter=0')

            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={
                                     'lid': l.id,
                                     'name': l.name
                                 })

    def parse(self, response):
        db_name = response.meta['name']
        for i in response.xpath("//div[@class='g']"):
            print('**** FIRST G CLASS ****', i)
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            print('Zone2 Result Link: ', clink)
            if 'https://www.linkedin.com/in/' in clink:
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)
                slp_xtract = i.xpath(
                    ".//div[contains(@class, 'slp')]/descendant::text()"
                ).extract()
                print('Raw SLP Xtract: ', slp_xtract)
                print('LENGTH of SLP Xtract: ', len(slp_xtract))

                if len(slp_xtract) > 0:
                    txt = str(slp_xtract)
                    print('length of slp: ', len(txt))
                    print('slp class detected. Running Zone3a Analysis...')
                    city, role, firm = zone3a(txt)
                    print('results from zone3a analysis: ')
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['ident'] = response.meta['lid']
                    item['location'] = city
                    item['role'] = role if role1 is None else role1
                    item['firm'] = firm if firm1 is None else firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        yield item
                    else:
                        yield None

                else:
                    print('no slp class found.  salvaging text')
                    st_class = i.xpath(
                        ".//span[contains(@class, 'st')]/descendant::text()"
                    ).extract()
                    print('ST Text Extracted: ', st_class)
                    salvage_string = list2string(st_class)
                    print('st class converted to string: ', salvage_string)
                    cleaned_str = clean_string(salvage_string, name)
                    print('st string filtered: ', cleaned_str)
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['location'] = None
                    item['ident'] = response.meta['lid']
                    item['role'] = role1
                    if firm1 is None:
                        if len(cleaned_str) > 100:
                            print(
                                ">>Cleaned string too long for db. Reducing to: ",
                                cleaned_str[:99])
                            item['firm'] = cleaned_str[:99]
                        else:
                            item['firm'] = cleaned_str
                    else:
                        item['firm'] = firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        yield item
                    else:
                        yield None
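
Each parse() pushes the raw <cite> markup through zone2() before testing for
a linkedin.com/in/ link. One plausible sketch, assuming the input is the
stringified list produced by str(i.xpath(".//cite").extract()):

import re

def zone2(raw_cite):
    # Hypothetical stand-in: strip tags from the stringified <cite> list and
    # return the first https:// URL found, or '' when there is none.
    no_tags = re.sub(r'<[^>]+>', '', raw_cite)
    match = re.search(r"https://[^\s'\",\]]+", no_tags)
    return match.group(0) if match else ''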
Example #4
import datetime

import scrapy

# Project-specific helpers assumed importable: load_tables, zone1, zone2,
# zone3a, score_name, list2string, clean_string, and the S3TrackingItem item
# class.
class QuotesSpider(scrapy.Spider):
    name = "tracking"
    sesh, Suspect, Leaver = load_tables()
    fresh_lvr = sesh.query(Leaver).filter_by(result='Tracking',
                                             inprosshell='Yes',
                                             lasttracked=None).limit(5).all()
    lvr = sesh.query(Leaver).filter_by(result='Tracking').order_by(
        Leaver.lasttracked).limit(5).all()
    print('------> Number of First Time Tracks: ', len(fresh_lvr))
    print('------> Number of Re-Tracks: ', len(lvr))

    def start_requests(self,
                       sesh=sesh,
                       Leaver=Leaver,
                       lvr=lvr,
                       fresh_lvr=fresh_lvr):
        print('*********** Leavers To Be Tracked **********')
        # Track never-tracked leavers first; otherwise re-track the stalest.
        for l in (fresh_lvr if fresh_lvr else lvr):
            print(l.name)
            try:
                # The first word of the old firm name narrows the search.
                oldfirm = l.prosfirm.split()[0]
                url = ('https://www.google.com/search?q=' + l.name + ' ' +
                       oldfirm + ' ' + 'site:www.linkedin.com/in/')
            except (AttributeError, IndexError):
                # No usable firm on record; search by name alone.
                url = ('https://www.google.com/search?q=' + l.name + ' ' +
                       'site:www.linkedin.com/in/')
            l.lasttracked = datetime.datetime.now(
                datetime.timezone.utc).isoformat()
            sesh.commit()

            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={
                                     'lid': l.id,
                                     'name': l.name,
                                     'truelink': l.link
                                 })

    def parse(self, response):
        db_name = response.meta['name']
        truelink = response.meta['truelink']
        print('***')
        print('***')
        print('***')
        print('Parsing: ', db_name)
        for i in response.xpath("//div[@class='g']"):
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            if 'https://www.linkedin.com/in/' in clink and clink == truelink:
                print('Links Matched. Proceeding...')
                print('DB Link: ', truelink)
                print('Scraped Link: ', clink)
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)

                name_test = score_name(name, db_name)
                if name_test > 80:
                    print('Passing Score: ', name_test)
                    slp_xtract = i.xpath(
                        ".//div[contains(@class, 'slp')]/descendant::text()"
                    ).extract()
                    print('Raw SLP Xtract: ', slp_xtract)
                    print('LENGTH of SLP Xtract: ', len(slp_xtract))

                    if len(slp_xtract) > 0:
                        txt = str(slp_xtract)
                        print('length of slp: ', len(txt))
                        print('slp class detected. Running Zone3a Analysis...')
                        city, role, firm = zone3a(txt)
                        print('results from zone3a analysis: ')
                        item = S3TrackingItem()
                        item['name'] = name
                        item['link'] = clink
                        item['ident'] = response.meta['lid']
                        item['location'] = city
                        item['role'] = role if role1 is None else role1
                        item['firm'] = firm if firm1 is None else firm1

                        yield item

                    else:
                        print('no slp class found.  salvaging text')
                        st_class = i.xpath(
                            ".//span[contains(@class, 'st')]/descendant::text()"
                        ).extract()
                        print('ST Text Extracted: ', st_class)
                        salvage_string = list2string(st_class)
                        cleaned_str = clean_string(salvage_string, name)
                        item = S3TrackingItem()
                        item['name'] = name
                        item['link'] = clink
                        item['location'] = None
                        item['ident'] = response.meta['lid']
                        item['role'] = role1
                        if firm1 is None:
                            salvage_text = cleaned_str.strip()
                            print('length of salvaged text: ',
                                  len(salvage_text))
                            # Slicing a str cannot raise, so the old bare
                            # try/except around it was dead code.
                            if len(salvage_text) < 100:
                                item['firm'] = salvage_text
                            else:
                                item['firm'] = salvage_text[:98]
                        else:
                            item['firm'] = firm1
                        yield item

                else:
                    print('Failing Score: ', name_test)
                    yield None
            else:
                print("Links Don't Match: ")
                print("DB Link: ", truelink)
                print('Scraped Link: ', clink)
                yield None
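
Every spider starts from load_tables(), which hands back a SQLAlchemy session
plus the Suspect and Leaver model classes. One way it could be written,
assuming the tables already exist and the connection URL below is a
placeholder:

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

def load_tables(db_url='sqlite:///leavers.db'):  # hypothetical URL
    # Reflect the existing schema and return a session plus mapped classes.
    engine = create_engine(db_url)
    Base = automap_base()
    Base.prepare(autoload_with=engine)
    return (Session(engine),
            Base.classes.suspects,  # assumed table name for Suspect
            Base.classes.leavers)   # assumed table name for Leaver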
Example #5
import scrapy

# Project-specific helpers assumed importable: load_tables,
# remove_html_markup, and the TrackItem item class.
class QuotesSpider(scrapy.Spider):
    name = "testing"
    sesh, Suspect, Leaver = load_tables()
    lvr = sesh.query(Leaver).filter_by(status='Lost', updated='No').order_by(
        Leaver.timestamp).limit(5).all()

    slinks = sesh.query(Suspect).all()
    link_list = []
    for s in slinks:
        link_list.append(s.link)

    def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr):
        test_name = 'Michael Gefen'
        test_id = 10
        url = ('https://www.google.com/search?q=' + test_name + ' ' +
               'site:www.linkedin.com')

        yield scrapy.Request(url=url, callback=self.parse, meta={'lid': test_id})

    def parse(self, response):
        for i in response.xpath('//*[@id="ires"]/ol/div[@class="g"]'):
            item = TrackItem()
            print('*********************************RESPONSE: ')
            print(i.extract())
            link_string = str(i.xpath('div/div[1]/cite').extract())
            print('link_string: ', link_string)
            stage_link = remove_html_markup(link_string).strip('[').strip(']').strip("\'")
            print('stage_link: ', stage_link)
            name_placeholder = i.xpath('h3/a/b/text()').extract()
            name_place = i.xpath('h3/a/text()').extract()
            print('length of HTML TEST: ', len(name_place))
            for k in name_place:
                print('pre-HTML content: ', k)
            np_test = remove_html_markup(str(name_place))  # helper expects a string
            print('HTML MarkUp Test: ', np_test)
            for j in name_placeholder:
                print('name_placeholder: ', j)
            item['name'] = name_placeholder[0].strip('[').strip(']')
            print("item['name']", item['name'])
            item['ident'] = response.meta['lid']
            if 'https://www.linkedin.com/pub/dir/' in stage_link or 'site:www.linkedin.com' in name_placeholder[0]:
                pass
            else:
                item['link'] = stage_link
                deet = i.xpath('div/div[2]/text()').extract()
                if len(deet) == 1:
                    deets = deet[0].replace(u'\xa0-\xa0', u'-')
                    deet_lst = deets.split('-')
                    print('deet_lst length: ', len(deet_lst))
                    print('DEET LIST VALUE: ', deet_lst)
                    if len(deet_lst) == 3:
                        try:
                            item['location'] = deet_lst[0]
                        except:
                            item['location'] = None
                        try:
                            item['role'] = deet_lst[1]
                        except:
                            item['role'] = None
                        try:
                            item['firm'] = deet_lst[2]
                        except:
                            item['firm'] = None
                    else:
                        item['location'] = None
                        item['role'] = None
                        item['firm'] = None

                yield item
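
Inside a Scrapy project each spider is run by its name attribute (for
instance, scrapy crawl testing). To try the last spider standalone, a
CrawlerProcess also works; the user agent below is an arbitrary example:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'USER_AGENT': 'Mozilla/5.0'})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes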