class QuotesSpider(scrapy.Spider): name = "suspects" sesh, Suspect, Leaver = load_tables() lvr = sesh.query(Leaver).filter_by(status='Lost', updated='No').order_by( Leaver.timestamp).limit(5).all() slinks = sesh.query(Suspect).all() link_list = [] for s in slinks: link_list.append(s.link) def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr): print('number of names to be scraped:', len(lvr)) if len(lvr) > 0: for l in lvr: print('Leaver Selected: ', l.name) lid = l.id string = str('https://www.google.com/search?q=' + l.name + ' ' + 'site:www.linkedin.com/in/' + ' ' + 'language:en') url = string l.timestamp = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name }) else: raise CloseSpider('All Leavers Have Suspects') def parse(self, response): db_name = response.meta['name'] for i in response.xpath("//div[contains(@class, 'g')]"): print('**** G CLASS ****', i) raw_lnk = str(i.xpath(".//cite").extract()) clink = zone2(raw_lnk) print('Testing New Zone2: ', clink) if 'https://www.linkedin.com/in/' in clink: h3a = i.xpath(".//h3/a").extract() name, role1, firm1 = zone1(h3a) slp_xtract = i.xpath( ".//div[contains(@class, 'slp')]/descendant::text()" ).extract() print('Raw SLP Xtract: ', slp_xtract) print('LENGTH of SLP Xtract: ', len(slp_xtract)) if len(slp_xtract) > 0: txt = str(slp_xtract) print('length of slp: ', len(txt)) print('slp class detected. Running Zone3a Analysis...') city, role, firm = zone3a(txt) print('results from zone3a analysis: ') item = TrackItem() item['name'] = name item['link'] = clink item['ident'] = response.meta['lid'] item['location'] = city if role1 == None: item['role'] = role else: item['role'] = role1 if firm1 == None: item['firm'] = firm else: item['firm'] = firm1 score = score_name(item['name'], db_name) if score > 80: item['status'] = 'Success' yield item else: yield None else: print('no slp class found. salvaging text') st_class = i.xpath( ".//span[contains(@class, 'st')]/descendant::text()" ).extract() print('ST Text Extracted: ', st_class) salvage_string = list2string(st_class) print('st class converted to string: ', salvage_string) cleaned_str = clean_string(salvage_string, name) cleaned_str = cleaned_str.strip() print('st string filtered: ', cleaned_str) item = TrackItem() item['name'] = name item['link'] = clink item['location'] = None item['ident'] = response.meta['lid'] if role1 == None: item['role'] = None else: item['role'] = role1 if firm1 == None: if len(cleaned_str) > 100: print( ">>Cleaned string too long for db. Reducing to: ", cleaned_str[:99]) item['firm'] = cleaned_str[:99] else: item['firm'] = cleaned_str else: item['firm'] = firm1 score = score_name(item['name'], db_name) if score > 80: item['status'] = 'Success' yield item else: yield None
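# The TrackItem class used above is not shown in this section. A minimal
# sketch, assuming it is a plain scrapy.Item with one Field per key the
# spider assigns (S3SuspectsItem and S3TrackingItem below would follow the
# same pattern):

import scrapy


class TrackItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    ident = scrapy.Field()      # id of the Leaver row this hit belongs to
    location = scrapy.Field()
    role = scrapy.Field()
    firm = scrapy.Field()
    status = scrapy.Field()     # set to 'Success' on a confident match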
class QuotesSpider(scrapy.Spider): name = "suspects" sesh, Suspect, Leaver = load_tables() in_lvr = sesh.query(Leaver).filter_by( result='Lost', inprosshell='Yes').order_by(Leaver.suspectcheck).limit(5).all() out_lvr = sesh.query(Leaver).filter_by(result='Lost').order_by( Leaver.suspectcheck).limit(5).all() slinks = sesh.query(Suspect).all() link_list = [] for s in slinks: link_list.append(s.slink) if len(in_lvr) > 0: lvr = in_lvr else: lvr = out_lvr def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr): print('number of names to be scraped:', len(lvr)) if len(lvr) > 0: for l in lvr: print('Leaver Selected: ', l.name) lid = l.id try: old_firm_full = l.prosfirm old_firm_list = old_firm_full.split() oldfirm = old_firm_list[0] string = str('https://www.google.com/search?q=' + l.name + ' ' + oldfirm + ' ' + 'site:www.linkedin.com/in/') url = string l.suspectcheck = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name }) except: string = str('https://www.google.com/search?q=' + l.name + ' ' + 'site:www.linkedin.com/in/') url = string l.suspectcheck = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name }) else: raise CloseSpider('All Leavers Have Suspects') def parse(self, response): db_name = response.meta['name'] print('***') print('***') print('***') print('Parsing: ', db_name) for i in response.xpath("//div[@class='g']"): raw_lnk = str(i.xpath(".//cite").extract()) clink = zone2(raw_lnk) if 'https://www.linkedin.com/in/' in clink: h3a = i.xpath(".//h3/a").extract() name, role1, firm1 = zone1(h3a) name_test = score_name(name, db_name) if name_test > 65: print('Passing Sore: ', name_test) slp_xtract = i.xpath( ".//div[contains(@class, 'slp')]/descendant::text()" ).extract() print('Raw SLP Xtract: ', slp_xtract) print('LENGTH of SLP Xtract: ', len(slp_xtract)) if len(slp_xtract) > 0: txt = str(slp_xtract) print('length of slp: ', len(txt)) print('slp class detected. Running Zone3a Analysis...') city, role, firm = zone3a(txt) print('results from zone3a analysis: ') item = S3SuspectsItem() item['name'] = name item['link'] = clink item['ident'] = response.meta['lid'] item['location'] = city if role1 == None: item['role'] = role else: item['role'] = role1 if firm1 == None: item['firm'] = firm else: item['firm'] = firm1 yield item else: print('no slp class found. salvaging text') st_class = i.xpath( ".//span[contains(@class, 'st')]/descendant::text()" ).extract() print('ST Text Extracted: ', st_class) salvage_string = list2string(st_class) print('st class converted to string: ', salvage_string) cleaned_str = clean_string(salvage_string, name) cleaned_str = cleaned_str.strip() print('st string filtered: ', cleaned_str) item = S3SuspectsItem() item['name'] = name item['link'] = clink item['location'] = None item['ident'] = response.meta['lid'] if role1 == None: item['role'] = None else: item['role'] = role1 if firm1 == None: if len(cleaned_str) > 100: print( ">>Cleaned string too long for db. Reducing to: ", cleaned_str[:98]) item['firm'] = cleaned_str[:98] else: item['firm'] = cleaned_str else: item['firm'] = firm1 yield item else: print('Failing Score: ', name_test) yield None
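# score_name() compares the scraped name against the database name, and the
# spiders treat its result as a 0-100 score (thresholds of 65 and 80). A
# minimal sketch, assuming a fuzzy string match via fuzzywuzzy; the real
# helper may use a different ratio or library:

from fuzzywuzzy import fuzz


def score_name(scraped_name, db_name):
    # token_set_ratio tolerates reordered name parts, e.g.
    # "Gefen, Michael" vs "Michael Gefen" still scores 100.
    return fuzz.token_set_ratio(scraped_name.lower(), db_name.lower())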
class QuotesSpider(scrapy.Spider): name = "tracking" sesh, Suspect, Leaver = load_tables() fresh_lvr = sesh.query(Leaver).filter_by( status='Tracking', track_lst_update=None).limit(5).all() lvr = sesh.query(Leaver).filter_by(status='Tracking').order_by( Leaver.track_lst_update).limit(5).all() def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr, fresh_lvr=fresh_lvr): print('***** Number of Fresh Leavers Not Yet Tracked: ', len(fresh_lvr)) if len(fresh_lvr) > 0: for l in fresh_lvr: lid = l.id url = 'https://www.google.com/search?q=' + l.llink + ' ' + 'filter=0' yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name }) else: for l in lvr: lid = l.id url = 'https://www.google.com/search?q=' + l.llink + ' ' + 'filter=0' yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name }) def parse(self, response): db_name = response.meta['name'] for i in response.xpath("//div[@class='g']"): print('**** FIRST G CLASS ****', i) raw_lnk = str(i.xpath(".//cite").extract()) clink = zone2(raw_lnk) print('Zone2 Result Link: ', clink) if 'https://www.linkedin.com/in/' in clink: h3a = i.xpath(".//h3/a").extract() name, role1, firm1 = zone1(h3a) slp_xtract = i.xpath( ".//div[contains(@class, 'slp')]/descendant::text()" ).extract() print('Raw SLP Xtract: ', slp_xtract) print('LENGTH of SLP Xtract: ', len(slp_xtract)) if len(slp_xtract) > 0: txt = str(slp_xtract) print('length of slp: ', len(txt)) print('slp class detected. Running Zone3a Analysis...') city, role, firm = zone3a(txt) print('results from zone3a analysis: ') item = TrackItem() item['name'] = name item['link'] = clink item['ident'] = response.meta['lid'] item['location'] = city if role1 == None: item['role'] = role else: item['role'] = role1 if firm1 == None: item['firm'] = firm else: item['firm'] = firm1 score = score_name(item['name'], db_name) if score > 80: yield item else: yield None else: print('no slp class found. salvaging text') st_class = i.xpath( ".//span[contains(@class, 'st')]/descendant::text()" ).extract() print('ST Text Extracted: ', st_class) salvage_string = list2string(st_class) print('st class converted to string: ', salvage_string) cleaned_str = clean_string(salvage_string, name) print('st string filtered: ', cleaned_str) item = TrackItem() item['name'] = name item['link'] = clink item['location'] = None item['ident'] = response.meta['lid'] if role1 == None: item['role'] = None else: item['role'] = role1 if firm1 == None: if len(cleaned_str) > 100: print( ">>Cleaned string too long for db. Reducing to: ", cleaned_str[:99]) item['firm'] = cleaned_str[:99] else: item['firm'] = cleaned_str else: item['firm'] = firm1 score = score_name(item['name'], db_name) if score > 80: yield item else: yield None
class QuotesSpider(scrapy.Spider): name = "tracking" sesh, Suspect, Leaver = load_tables() fresh_lvr = sesh.query(Leaver).filter_by(result='Tracking', inprosshell='Yes', lasttracked=None).limit(5).all() lvr = sesh.query(Leaver).filter_by(result='Tracking').order_by( Leaver.lasttracked).limit(5).all() print('------> Number of First Time Tracks: ', len(fresh_lvr)) print('------> Number of Re-Tracks: ', len(lvr)) def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr, fresh_lvr=fresh_lvr): print('*********** Leavers To Be Tracked **********') if len(fresh_lvr) > 0: for l in fresh_lvr: print(l.name) lid = l.id try: old_firm_full = l.prosfirm old_firm_list = old_firm_full.split() oldfirm = old_firm_list[0] string = str('https://www.google.com/search?q=' + l.name + ' ' + oldfirm + ' ' + 'site:www.linkedin.com/in/') url = string l.lasttracked = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name, 'truelink': l.link }) except: string = str('https://www.google.com/search?q=' + l.name + ' ' + 'site:www.linkedin.com/in/') url = string l.lasttracked = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name, 'truelink': l.link }) else: for l in lvr: print(l.name) lid = l.id try: old_firm_full = l.prosfirm old_firm_list = old_firm_full.split() oldfirm = old_firm_list[0] string = str('https://www.google.com/search?q=' + l.name + ' ' + oldfirm + ' ' + 'site:www.linkedin.com/in/') url = string l.lasttracked = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name, 'truelink': l.link }) except: string = str('https://www.google.com/search?q=' + l.name + ' ' + 'site:www.linkedin.com/in/') url = string l.lasttracked = datetime.datetime.now( datetime.timezone.utc).isoformat() sesh.commit() yield scrapy.Request(url=url, callback=self.parse, meta={ 'lid': l.id, 'name': l.name, 'truelink': l.link }) def parse(self, response): db_name = response.meta['name'] truelink = response.meta['truelink'] print('***') print('***') print('***') print('Parsing: ', db_name) for i in response.xpath("//div[@class='g']"): raw_lnk = str(i.xpath(".//cite").extract()) clink = zone2(raw_lnk) if 'https://www.linkedin.com/in/' in clink and clink == truelink: print('Links Matched. Proceeding...') print('DB Link: ', truelink) print('Scraped Link: ', clink) h3a = i.xpath(".//h3/a").extract() name, role1, firm1 = zone1(h3a) name_test = score_name(name, db_name) if name_test > 80: print('Passing Sore: ', name_test) slp_xtract = i.xpath( ".//div[contains(@class, 'slp')]/descendant::text()" ).extract() print('Raw SLP Xtract: ', slp_xtract) print('LENGTH of SLP Xtract: ', len(slp_xtract)) if len(slp_xtract) > 0: txt = str(slp_xtract) print('length of slp: ', len(txt)) print('slp class detected. Running Zone3a Analysis...') city, role, firm = zone3a(txt) print('results from zone3a analysis: ') item = S3TrackingItem() item['name'] = name item['link'] = clink item['ident'] = response.meta['lid'] item['location'] = city if role1 == None: item['role'] = role else: item['role'] = role1 if firm1 == None: item['firm'] = firm else: item['firm'] = firm1 yield item else: print('no slp class found. 
salvaging text') st_class = i.xpath( ".//span[contains(@class, 'st')]/descendant::text()" ).extract() print('ST Text Extracted: ', st_class) salvage_string = list2string(st_class) cleaned_str = clean_string(salvage_string, name) item = S3TrackingItem() item['name'] = name item['link'] = clink item['location'] = None item['ident'] = response.meta['lid'] if role1 == None: item['role'] = None else: item['role'] = role1 if firm1 == None: salvage_text = cleaned_str.strip() print('length of salvaged text: ', len(salvage_text)) if len(salvage_text) < 100: item['firm'] = salvage_text else: try: item['firm'] = salvage_text[:98] except: item['firm'] = None else: item['firm'] = firm1 yield item else: print('Failing Score: ', name_test) yield None else: print("Links Don't Match: ") print("DB Link: ", truelink) print('Scraped Link: ', clink) yield None
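# Every spider calls load_tables() for a SQLAlchemy session plus the mapped
# Suspect and Leaver classes. The helper is not shown; a minimal sketch,
# assuming reflection over an existing database (the URL and the table names
# 'suspects'/'leavers' are placeholders, not confirmed by this code):

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import sessionmaker


def load_tables(db_url='sqlite:///leavers.db'):
    engine = create_engine(db_url)
    Base = automap_base()
    Base.prepare(engine, reflect=True)  # map classes from existing tables
    session = sessionmaker(bind=engine)()
    return session, Base.classes.suspects, Base.classes.leavers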
class QuotesSpider(scrapy.Spider): name = "testing" sesh, Suspect, Leaver = load_tables() lvr = sesh.query(Leaver).filter_by(status='Lost', updated='No').order_by(Leaver.timestamp).limit(5).all() slinks = sesh.query(Suspect).all() link_list = [] for s in slinks: link_list.append(s.link) def start_requests(self, sesh=sesh, Leaver=Leaver, lvr=lvr): test_name = 'Michael Gefen' test_id = 10 url = 'https://www.google.com/search?q=' + test_name + ' ' + 'site:www.linkedin.com' yield scrapy.Request(url=url, callback=self.parse, meta={'lid': test_id}) def parse(self, response): #for i in response.xpath('//*[@id="ires"]/ol/div[@class="g"]'): for i in response.xpath('//*[@id="ires"]/ol/div[@class="g"]'): #for i in response.xpath('//*[@id="ires"]/ol/div[@class="g"]'): item = TrackItem() print('*********************************RESPONSE: ') print(i.extract()) link_string = str(i.xpath('div/div[1]/cite').extract()) print('link_string: ', link_string) stage_link = remove_html_markup(link_string).strip('[').strip(']').strip("\'") print('stage_link: ', stage_link) name_placeholder = i.xpath('h3/a/b/text()').extract() name_place = i.xpath('h3/a/text()').extract() print('length of HTML TEST: ', len(name_place)) for k in name_place: print('pre-HTML content: ', k) np_test = remove_html_markup(name_place) print('HTML MarkUp Test: ', np_test) for j in name_placeholder: print('name_placeholder: ', j) item['name'] = name_placeholder[0].strip('[').strip(']') print("item['name']", item['name']) item['ident'] = response.meta['lid'] if 'https://www.linkedin.com/pub/dir/' in stage_link or 'site:www.linkedin.com' in name_placeholder[0]: pass else: item['link'] = stage_link deet = i.xpath('div/div[2]/text()').extract() if len(deet) == 1: deets = deet[0].replace(u'\xa0-\xa0', u'-') deet_lst = deets.split('-') print('deet_lst length: ', len(deet_lst)) print('DEET LIST VALUE: ', deet_lst[1]) #print('!!!!!!!!!!', len(deet_lst)) if len(deet_lst) == 3: try: item['location'] = deet_lst[0] except: item['location'] = None try: item['role'] = deet_lst[1] except: item['role'] = None try: item['firm'] = deet_lst[2] except: item['firm'] = None else: item['location'] = None item['role'] = None item['firm'] = None #xtrct = str(i.xpath('h3/a/@href').extract()) #item['link'] = re.search('q=(.*)&(amp;)?sa', xtrct) #text = response.xpath('/a').extract() #item['details'] = re.sub('<[^<]+?>', '', text) #item['details'] = response.xpath('/a').extract() yield item
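# remove_html_markup() strips tags from the extracted fragments above. Its
# implementation is not shown; a minimal sketch using a small state machine
# (the real helper could just as well be re.sub('<[^<]+?>', '', s)):


def remove_html_markup(s):
    tag = False    # currently inside a <...> tag
    quote = False  # currently inside a quoted attribute value
    out = ''
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out += c
    return out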