import csv
import time

from bson.objectid import ObjectId

# NOTE: the project-local helpers used below (getGoogleLinksForSearchText,
# refer_collection, is_profile_exist, scrape_opencorporates, scrape_dnb,
# scrape_person, check_person_association_comp, check_if_important_person,
# calculate_score) and the `three_up` base path are imported from elsewhere
# in this package.


def search_a_company():
    # NOTE: redefined later in this module with a
    # (comp_name, db_collection, query_entry) signature; that later
    # definition shadows this one.
    try:
        # Store search results for previously missed companies in a CSV file.
        with open('for_listing_websites_edu.csv', mode='w', encoding='utf8', newline='') as results_file:
            results_writer = csv.writer(results_file, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
            attributes_a = ['search_text', 'title', 'link', 'description', 'rich_description']
            results_writer.writerow(attributes_a)
            with open('not_found.txt', 'r') as f:
                missed_list = [link.strip() for link in f.readlines()]
            for row in missed_list:
                print(row)
                # Retry the search up to three times if no results come back.
                sr = getGoogleLinksForSearchText(row + " Australia", 5, 'initial')
                if len(sr) == 0:
                    sr = getGoogleLinksForSearchText(row + " Australia", 5, 'initial')
                if len(sr) == 0:
                    sr = getGoogleLinksForSearchText(row + " Australia", 5, 'initial')
                # Back off with a linearly growing sleep while Google serves captchas.
                count = 0
                while sr == 'captcha':
                    count = count + 1
                    print('captcha detected, sleeping for n times, n:', count)
                    time.sleep(1200 * count)
                    sr = getGoogleLinksForSearchText(row + " Australia", 5, 'initial')
                if not sr:  # still nothing after the retries; skip this company
                    continue
                print(sr[0])
                results_writer.writerow([sr[0]['search_text'], sr[0]['title'], sr[0]['link'],
                                         sr[0]['description'], sr[0]['rich_description']])
    except Exception as e:
        print("Error occurred! Try again:", e)
        return 'error'
# search_a_company()
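
# The captcha handling above (retry with a linearly growing sleep) recurs in
# several functions below. A minimal sketch of how it could be factored into
# one helper; `search_with_captcha_backoff` is hypothetical and not part of
# the original module.
def search_with_captcha_backoff(search_text, n_results, mode, base_delay=1200):
    """Call getGoogleLinksForSearchText, backing off while it returns 'captcha'."""
    sr = getGoogleLinksForSearchText(search_text, n_results, mode)
    count = 0
    while sr == 'captcha':
        count = count + 1
        print('captcha detected, sleeping, attempt:', count)
        time.sleep(base_delay * count)  # linear backoff: 20 min, 40 min, ...
        sr = getGoogleLinksForSearchText(search_text, n_results, mode)
    return sr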
def get_li_url(entry_id):
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        sm_links = data[0]['social_media_links']
    except Exception:
        sm_links = []
    # Prefer a LinkedIn company URL already collected during crawling.
    linked_in_comp_urls = []
    for each in sm_links:
        if 'linkedin.com/company' in each:
            linked_in_comp_urls.append(each)
    if len(linked_in_comp_urls):
        print("LinkedIn profile collected from crawled data")
        return linked_in_comp_urls[0]
    else:
        # Otherwise fall back to a Google search, retrying up to three times.
        comp_name = data[0]['comp_name']
        print(comp_name)
        sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        if len(sr) == 0:
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        if len(sr) == 0:
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
        filtered_li = []
        for p in sr:
            if 'linkedin.com/company' in p['link']:
                filtered_li.append(p['link'])
        if len(filtered_li):
            return filtered_li[0]
        else:
            print("No LinkedIn profile found! Try again.")
            return False
def get_cp_oc(entry_id, mode):
    mycol = refer_collection()  # refer the collection in CompanyDatabase
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        if mode == 'comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            comp_name = data[0]['comp_name']
    except KeyError:
        # Fall back to the domain of the stored link.
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " opencorporates", 3, 'normal')
    # Keep only AU/NZ OpenCorporates company profiles.
    filtered_oc = []
    for p in sr:
        if ('opencorporates.com/companies/nz' in p['link']) or ('opencorporates.com/companies/au' in p['link']):
            filtered_oc.append([p['title'], p['link']])
    if len(filtered_oc):
        print(filtered_oc[0])
        det.append(filtered_oc[0])
        det.append(scrape_opencorporates(filtered_oc[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'oc_cp_info': det}})
        print("Successfully extended the data entry with opencorporates contact person data", entry_id)
    else:
        print("No opencorporates profile found! Try again.")
        mycol.update_one({'_id': entry_id}, {'$set': {'oc_cp_info': det}})
def search_a_company_alpha(comp_name, db_collection, query_entry, c_name):
    sr = getGoogleLinksForSearchText(comp_name, 3, 'normal')
    count = 0
    while sr == 'captcha':
        count = count + 1
        print('captcha detected, sleeping for n times, n:', count)
        time.sleep(1200 * count)
        sr = getGoogleLinksForSearchText(comp_name, 3, 'normal')
    with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
        black_list = b_list_file.read().splitlines()  # e.g. 'www.dnb.com'
    received_links = [i['link'] for i in sr]
    received_domains = [i.split("/")[2] for i in received_links]
    # Drop results whose domain is blacklisted.
    filtered_sr = []
    print('rd', received_domains)
    for i, each in enumerate(received_domains):
        if each not in black_list:
            filtered_sr.append(sr[i])
    if len(filtered_sr):
        # Skip the link if its profile is already in the database.
        res_data = is_profile_exist(filtered_sr[0]['link'])
        if len(res_data):
            print("Profile " + filtered_sr[0]['link'] + " already existing at " + str(res_data[0]['_id']))
            return 'exist'
        filtered_sr[0]['comp_name'] = c_name
        filtered_sr[0]['query_id'] = query_entry
        record_entry = db_collection.insert_one(filtered_sr[0])
        print(filtered_sr[0])
        print("search record stored in db: ", record_entry.inserted_id)
        return record_entry.inserted_id
    else:
        print("No results found!")
        return None
def get_cp_dnb(entry_id, mode):
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        if mode == 'comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            print(data)
            comp_name = data[0]['comp_name']
    except KeyError:
        # Fall back to the domain of the stored link.
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " dnb.com", 3, 'normal')
    # Keep only dnb.com company-profile links.
    filtered_dnb = []
    for p in sr:
        if 'dnb.com/business-directory/company-profiles' in p['link']:
            filtered_dnb.append([p['title'], p['link']])
    if len(filtered_dnb):
        print("dnb profile found, extracting contact persons..")
        print(filtered_dnb[0])
        det.append(filtered_dnb[0])
        print(filtered_dnb[0][1])
        det.append(scrape_dnb(filtered_dnb[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'dnb_cp_info': det}})
        print("Successfully extended the data entry with dnb contact person data", entry_id)
    else:
        print("No dnb profile found! Try again..")
        print(det)
        mycol.update_one({'_id': entry_id}, {'$set': {'dnb_cp_info': det}})
def search_a_company(comp_name, db_collection, query_entry):
    try:
        sr = getGoogleLinksForSearchText(comp_name + " Australia", 5, 'normal')
        count = 0
        while sr == 'captcha':
            count = count + 1
            print('captcha detected, sleeping for n times, n:', count)
            time.sleep(1200 * count)
            # Bug fix: retry with the same " Australia" suffix as the first call.
            sr = getGoogleLinksForSearchText(comp_name + " Australia", 5, 'normal')
        with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
            black_list = b_list_file.read().splitlines()  # e.g. 'www.dnb.com'
        received_links = [i['link'] for i in sr]
        print(received_links)
        # Keep only links with a recognised commercial TLD.
        filtered_sr_a = []
        filtered_received_links = []
        for i, each_l in enumerate(received_links):
            if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                    or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.org/' in each_l)
                    or ('.co.nz/' in each_l) or ('.nz/' in each_l) or ('.au/' in each_l)
                    or ('.biz/' in each_l)):
                filtered_received_links.append(each_l)
                filtered_sr_a.append(sr[i])
        print(filtered_sr_a)
        received_domains = [i.split("/")[2] for i in filtered_received_links]
        # Drop government, education, and UK domains, plus anything blacklisted.
        filtered_sr = []
        print('rd', received_domains)
        for i, each in enumerate(received_domains):
            if ('.gov.' in each) or ('.govt.' in each) or ('.edu.' in each) or ('.uk' in each):
                continue
            if each not in black_list:
                filtered_sr.append(filtered_sr_a[i])
        if len(filtered_sr):
            # Skip the link if its profile is already in the database.
            res_data = is_profile_exist(filtered_sr[0]['link'])
            if len(res_data):
                print("Profile " + filtered_sr[0]['link'] + " already existing at " + str(res_data[0]['_id']))
                return 'exist'
            # Derive the company name from the result's domain.
            c_n_link = filtered_sr[0]['link']
            c_n_dom = c_n_link.split("/")[2]
            try:
                c_name = c_n_dom.split("www.")[1]
            except IndexError:
                c_name = c_n_dom
            if '.com' in c_name:
                cc_name = c_name.split(".com")[0]
            elif '.org' in c_name:
                cc_name = c_name.split(".org")[0]
            elif '.io' in c_name:
                cc_name = c_name.split(".io")[0]
            elif '.net' in c_name:
                cc_name = c_name.split(".net")[0]
            else:
                cc_name = c_name
            filtered_sr[0]['comp_name'] = cc_name
            filtered_sr[0]['query_id'] = query_entry
            record_entry = db_collection.insert_one(filtered_sr[0])
            print(filtered_sr[0])
            print("search record stored in db: ", record_entry.inserted_id)
            return record_entry.inserted_id
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occurred! Try again:", e)
        return 'error'
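
# The company-name derivation above (strip "www." and the first recognised
# TLD fragment) reappears in search_a_query below. A hypothetical helper
# capturing the same rule; note '.com' must be tested before '.co', since
# every '.com' domain also contains '.co'.
def comp_name_from_domain(domain):
    """Rough company name from a domain, e.g. 'www.acme.com.au' -> 'acme'."""
    try:
        name = domain.split("www.")[1]
    except IndexError:
        name = domain
    for tld in ('.com', '.org', '.io', '.net', '.co', '.edu'):
        if tld in name:
            return name.split(tld)[0]
    return name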
def search_a_query(search_query, number_of_results, db_collection, query_entry):
    # NOTE: redefined later in this module by a CSV-writing variant with the
    # same signature; that later definition shadows this one.
    try:
        sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
        count = 0
        while sr == 'captcha':
            count = count + 1
            print('captcha detected, sleeping for n times, n:', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
        if len(sr):
            for each_sr in sr:
                print(each_sr)
            received_links = [i['link'] for i in sr]
            # Keep only links with a recognised commercial TLD.
            filtered_received_links = []
            for each_l in received_links:
                if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                        or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.org/' in each_l)
                        or ('.co.nz/' in each_l) or ('.nz/' in each_l) or ('.au/' in each_l)
                        or ('.biz/' in each_l)):
                    filtered_received_links.append(each_l)
            received_domains = [i.split("/")[2] for i in filtered_received_links]
            print("received_domains", received_domains)
            # De-duplicate; this loses the pairing with received_links, so only
            # the domain is printed inside the loop below.
            received_domains = list(set(received_domains))
            print("received_domains", received_domains)
            with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
                black_list = b_list_file.read().splitlines()
            ids_list = []
            for k in range(len(received_domains)):
                time.sleep(10)
                print(received_domains[k])
                if received_domains[k] in black_list:  # filter non-wanted websites
                    continue
                if (('.gov.' in received_domains[k]) or ('.govt.' in received_domains[k])
                        or ('.edu.' in received_domains[k]) or ('.uk' in received_domains[k])):
                    # filter non-wanted websites
                    continue
                # Search the bare domain, retrying up to three times.
                sr = getGoogleLinksForSearchText(received_domains[k], 3, 'normal')
                if len(sr) == 0:
                    sr = getGoogleLinksForSearchText(received_domains[k], 3, 'normal')
                if len(sr) == 0:
                    sr = getGoogleLinksForSearchText(received_domains[k], 3, 'normal')
                if len(sr) > 0:
                    print(sr[0])
                    res_data = is_profile_exist(sr[0]['link'])
                    if len(res_data):
                        print("Profile " + sr[0]['link'] + " already existing at " + str(res_data[0]['_id']))
                        continue
                    sr[0]['search_text'] = search_query
                    # Derive the company name from the domain.
                    try:
                        c_name = received_domains[k].split("www.")[1]
                    except IndexError:
                        c_name = received_domains[k]
                    if '.com' in c_name:
                        sr[0]['comp_name'] = c_name.split(".com")[0]
                    elif '.org' in c_name:
                        sr[0]['comp_name'] = c_name.split(".org")[0]
                    elif '.co' in c_name:
                        sr[0]['comp_name'] = c_name.split(".co")[0]
                    elif '.edu' in c_name:
                        sr[0]['comp_name'] = c_name.split(".edu")[0]
                    else:
                        sr[0]['comp_name'] = c_name
                    print(sr[0])
                    sr[0]['query_id'] = query_entry
                    record_entry = db_collection.insert_one(sr[0])
                    print("search record stored in db: ", record_entry.inserted_id)
                    ids_list.append(record_entry.inserted_id)
                else:
                    print("Cannot find results, skipping company")
            print(ids_list)
            return ids_list
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occurred! Try again:", e)
        return 'error'

# mycol = refer_collection()
# search_a_company('TOOHEYS PTY LIMITED', mycol)
# search_a_company('CALTEX PETROLEUM PTY LTD', mycol)
# search_a_query('Digital advertisement and marketing analytics services company', 5, mycol)
def update_a_company(comp_name, db_collection, entry_id):
    print(entry_id)
    sr = getGoogleLinksForSearchText(comp_name, 50, 'normal')
    count = 0
    while sr == 'captcha':
        count = count + 1
        print('captcha detected, sleeping for n times, n:', count)
        time.sleep(1200 * count)
        sr = getGoogleLinksForSearchText(comp_name, 50, 'normal')
    with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
        black_list = b_list_file.read().splitlines()  # e.g. 'www.dnb.com'
    received_links = [i['link'] for i in sr]
    print(received_links)
    # Keep only links with a recognised commercial TLD.
    filtered_received_links = []
    filtered_sr_a = []
    for i, each_l in enumerate(received_links):
        if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.org/' in each_l)
                or ('.co.nz/' in each_l) or ('.nz/' in each_l) or ('.au/' in each_l)
                or ('.biz/' in each_l)):
            filtered_received_links.append(each_l)
            filtered_sr_a.append(sr[i])
    print(filtered_received_links)
    print(filtered_sr_a)
    received_domains = [i.split("/")[2] for i in filtered_received_links]
    # Drop government, education, and UK domains, plus anything blacklisted.
    filtered_sr = []
    print('rd', received_domains)
    for i, each in enumerate(received_domains):
        print(each)
        if ('.gov.' in each) or ('.govt.' in each) or ('.edu.' in each) or ('.uk' in each):
            continue
        if each not in black_list:
            filtered_sr.append(filtered_sr_a[i])
    if len(filtered_sr):
        # Skip the link if its profile is already in the database.
        res_data = is_profile_exist(filtered_sr[0]['link'])
        print('sss', filtered_sr[0])
        if len(res_data):
            print("Profile " + filtered_sr[0]['link'] + " already existing at " + str(res_data[0]['_id']))
            return 'exist'
        filtered_sr[0]['comp_name'] = filtered_sr[0]['search_text']
        db_collection.update_one({'_id': ObjectId(entry_id)}, {'$set': filtered_sr[0]})
        print(filtered_sr[0])
        print("search record in db updated")
        print(entry_id)
        return entry_id
    else:
        print("No results found!")
        return None
def search_a_query(search_query, number_of_results, db_collection, query_entry):
    try:
        # Store search results in a CSV file for relevance checking.
        with open('check_relevence_risk_rich_ocu.csv', mode='w', encoding='utf8', newline='') as results_file:
            results_writer = csv.writer(results_file, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
            attributes_a = ['search_text', 'title', 'link', 'description',
                            'rich_description', 'comp_name', 'match_count']
            results_writer.writerow(attributes_a)
            sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
            count = 0
            while sr == 'captcha':
                count = count + 1
                print('captcha detected, sleeping for n times, n:', count)
                time.sleep(1200 * count)
                sr = getGoogleLinksForSearchText(search_query, number_of_results, 'normal')
            if len(sr):
                for each_sr in sr:
                    print(each_sr)
                received_links = [i['link'] for i in sr]
                # Keep only links with a recognised commercial TLD.
                filtered_received_links = []
                for each_l in received_links:
                    if (('.com/' in each_l) or ('.education/' in each_l) or ('.io/' in each_l)
                            or ('.com.au/' in each_l) or ('.net/' in each_l) or ('.org/' in each_l)
                            or ('.co.nz/' in each_l) or ('.nz/' in each_l) or ('.au/' in each_l)
                            or ('.biz/' in each_l)):
                        filtered_received_links.append(each_l)
                received_domains = [i.split("/")[2] for i in filtered_received_links]
                print("received_domains", received_domains)
                received_domains = list(set(received_domains))
                print("received_domains_no_duplicates", received_domains)
                with open(three_up + '//Simplified_System//Initial_Crawling//black_list.txt', 'r') as b_list_file:
                    black_list = b_list_file.read().splitlines()
                ids_list = []
                for k in range(len(received_domains)):
                    print("*****")
                    time.sleep(10)
                    print(received_domains[k])
                    if received_domains[k] in black_list:  # filter non-wanted websites
                        print("skipping as included in blacklist")
                        continue
                    if (('.gov.' in received_domains[k]) or ('.govt.' in received_domains[k])
                            or ('.edu.' in received_domains[k]) or ('.uk' in received_domains[k])):
                        # filter non-wanted websites
                        print("skipping as govt site")
                        continue
                    # Search the bare domain, retrying up to three times.
                    sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                    if len(sr) == 0:
                        sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                    if len(sr) == 0:
                        sr = getGoogleLinksForSearchText(received_domains[k], 3, 'initial')
                    if len(sr) > 0:
                        print(sr[0])
                        res_data = is_profile_exist(sr[0]['link'])
                        if len(res_data):
                            print("Profile " + sr[0]['link'] + " already existing at " + str(res_data[0]['_id']))
                            # A block updating the query's 'associated_entries'
                            # (and its trailing `continue`) is disabled here, so
                            # existing profiles still get scored and written to
                            # the CSV below.
                        sr[0]['search_text'] = search_query
                        # Derive the company name from the domain.
                        try:
                            c_name = received_domains[k].split("www.")[1]
                        except IndexError:
                            c_name = received_domains[k]
                        if '.com' in c_name:
                            sr[0]['comp_name'] = c_name.split(".com")[0]
                        elif '.org' in c_name:
                            sr[0]['comp_name'] = c_name.split(".org")[0]
                        elif '.co' in c_name:
                            sr[0]['comp_name'] = c_name.split(".co")[0]
                        elif '.edu' in c_name:
                            sr[0]['comp_name'] = c_name.split(".edu")[0]
                        else:
                            sr[0]['comp_name'] = c_name
                        print("search_text", sr[0]['search_text'])
                        print("rich_description", sr[0]['rich_description'])
                        print("selected_result ", sr[0])
                        # Score how well the result matches the de-localised query.
                        s_text_fixed = sr[0]['search_text'].replace('australia', '')
                        print('filtered', s_text_fixed)
                        match_count = calculate_score(s_text_fixed, sr[0]['rich_description'])
                        results_writer.writerow([sr[0]['search_text'], sr[0]['title'], sr[0]['link'],
                                                 sr[0]['description'], sr[0]['rich_description'],
                                                 sr[0]['comp_name'], match_count])
                        # Database insertion is disabled in this CSV-only variant:
                        # sr[0]['query_id'] = query_entry
                        # record_entry = db_collection.insert_one(sr[0])
                        # ids_list.append(record_entry.inserted_id)
                    else:
                        print("Cannot find results, skipping company")
                print(ids_list)
                return ids_list
            else:
                print("No results found!")
                return None
    except Exception as e:
        print("Error occurred! Try again:", e)
        return 'error'
def cross_check_person_linkedin(def_names, comp_name, status):
    title_link_list = []
    for i in range(len(def_names)):
        # Define the search text on Google for this person.
        if status == 'nested':
            name = def_names[i][0]
        elif status == 'not nested':
            name = def_names[i]
        search_text = name.lower() + ' ' + comp_name + ' linkedin'
        # Search Google and collect the titles and links of personal profiles.
        sr = getGoogleLinksForSearchText(search_text, 3, 'normal')
        filtered_li = []
        for p in sr:
            if 'linkedin.com' in p['link'] and '/in/' in p['link']:
                if [p['title'], p['link']] not in title_link_list and [p['title'], p['link']] not in filtered_li:
                    filtered_li.append([p['title'], p['link']])
        title_link_list.extend(filtered_li)
    # Remove duplicates from the nested list.
    fset = set(frozenset(x) for x in title_link_list)
    title_link_list = [list(x) for x in fset]
    # Extract the names and profile urls from the extracted profiles; the
    # frozenset step loses pair order, so detect which element is the url.
    names_in_profiles = []
    profile_urls = []
    for i in range(len(title_link_list)):
        if 'linkedin.com' in title_link_list[i][1]:
            temp = title_link_list[i][0].split(' - ')
            names_in_profiles.append(temp[0])
            profile_urls.append(title_link_list[i][1])
        else:
            temp = title_link_list[i][1].split(' - ')
            names_in_profiles.append(temp[0])
            profile_urls.append(title_link_list[i][0])
    # Scrape the linkedin profiles.
    scraped_profiles = []
    for i in range(len(profile_urls)):
        try:
            profile_dict = scrape_person(profile_urls[i])
        except Exception:
            print('Exception Occurred')
            continue
        scraped_profiles.append(profile_dict)
    # Check if the person is associated with the company; if so, extract them.
    persons_associated = check_person_association_comp(scraped_profiles, comp_name, profile_urls)
    # Check if the persons associated are in the upper hierarchy of the company.
    persons_relevant = check_if_important_person(persons_associated, [
        'co founder', 'co-founder', 'co-founded', 'co founded', 'co found', 'co-found',
        'managing director', 'director', ' ceo ', 'CEO', 'ceo', ' coo ',
        'founder', 'found', 'founding', 'executive director',
        'chief executive officer', 'chief executive', 'chief operating officer',
        'owner', 'chairman', 'chairperson'
    ])
    return persons_relevant
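
# A minimal, hedged usage sketch tying the pieces together, in the spirit of
# the commented example calls earlier in this module. The company name and
# mode values are illustrative; refer_collection() and the query id come from
# the surrounding project.
# mycol = refer_collection()
# entry_id = search_a_company('TOOHEYS PTY LIMITED', mycol, query_entry=None)
# if entry_id not in (None, 'error', 'exist'):
#     get_cp_dnb(entry_id, 'comp')   # D&B contact persons -> 'dnb_cp_info'
#     get_cp_oc(entry_id, 'comp')    # OpenCorporates data -> 'oc_cp_info'
#     print('LinkedIn company page:', get_li_url(entry_id))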