def search_a_company():
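    # Reads company names from not_found.txt, googles each one as "<name> Australia",
    # retries empty searches, backs off when a captcha is detected, and writes the top
    # result per company to for_listing_websites_edu.csv.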
    try:
        with open('for_listing_websites_edu.csv', mode='w', encoding='utf8',
                  newline='') as results_file:  # store search results into a CSV file
            results_writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            attributes_a = ['search_text', 'title', 'link', 'description', 'rich_description', 'comp_name']
            results_writer.writerow(attributes_a)
            with open('not_found.txt', 'r') as f:
                missed_list = [link.strip() for link in f.readlines()]
            # print(missed_list)
            for row in missed_list:
                # time.sleep(60)
                print(row)
                sr = getGoogleLinksForSearchText(row+" Australia", 5, 'initial')
                if (len(sr) == 0):
                    sr =  getGoogleLinksForSearchText(row+" Australia", 5, 'initial')
                    if (len(sr) == 0):
                        sr = getGoogleLinksForSearchText(row+" Australia", 5, 'initial')
                count = 0
                while (sr == 'captcha'):
                    count = count + 1
                    print('captcha detected, backing off (attempt):', count)
                    time.sleep(1200 * count)
                    sr = getGoogleLinksForSearchText(row+" Australia", 5, 'initial')
                print(sr[0])
                # the searched company name doubles as comp_name to match the CSV header
                results_writer.writerow([sr[0]['search_text'], sr[0]['title'], sr[0]['link'],
                                         sr[0]['description'], sr[0]['rich_description'], row])
            results_file.close()



    except Exception as e:
        print("Error occured! try again",e)
        return 'error'

# search_a_company()
Code example #2
def get_li_url(entry_id):
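    # Returns a LinkedIn company URL for the given MongoDB entry: taken from the crawled
    # 'social_media_links' field when available, otherwise from a Google search for
    # "<company> linkedin australia". Returns False when nothing is found.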

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    try:
        sm_links = data[0]['social_media_links']
    except Exception:
        sm_links = []
    linked_in_comp_urls = []
    for each in sm_links:
        if 'linkedin.com/company' in each:
            linked_in_comp_urls.append(each)
    if (len(linked_in_comp_urls)):
        print("LinkedIn profile taken from crawled data")
        return linked_in_comp_urls[0]
    else:
        comp_name = data[0]['comp_name']
        print(data[0]['comp_name'])
        sr = getGoogleLinksForSearchText( comp_name + " linkedin australia", 5, 'normal')
        if (len(sr) == 0):
            sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')
            if (len(sr) == 0):
                sr = getGoogleLinksForSearchText(comp_name + " linkedin australia", 5, 'normal')

        filtered_li = []
        for p in sr:
            # print(p['link'])
            if 'linkedin.com/company' in p['link']:
                filtered_li.append(p['link'])
        if (len(filtered_li)):
            return filtered_li[0]
        else:
            print("No linkedin contacts found!, Try again")
            return False
Code example #3
def get_cp_oc(entry_id,mode):
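    # Looks up the entry's company name, googles its opencorporates.com profile (AU/NZ
    # registries only), scrapes contact-person data from it and writes the result to the
    # entry's 'oc_cp_info' field (the bare name is stored when no profile is found).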
    # myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    # mydb = myclient["CompanyDatabase"]  # refer the database
    # mycol = mydb["comp_data"]  # refer the collection
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    try:
        if mode=='comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]

    det=[comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " opencorporates", 3, 'normal')

    filtered_oc = []
    for p in sr:
        if (('opencorporates.com/companies/nz' in p['link']) or ('opencorporates.com/companies/au' in p['link'])):
            filtered_oc.append([p['title'], p['link']])
    if (len(filtered_oc)):
        print(filtered_oc[0])
        det.append(filtered_oc[0])
        det.append(scrape_opencorporates(filtered_oc[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'oc_cp_info': det}})
        print("Successfully extended the data entry with opencorporates contact person data", entry_id)
    else:
        print("No opencorporates profile found!, Try again")
        mycol.update_one({'_id': entry_id},
                         {'$set': {'oc_cp_info': det}})
Code example #4
def search_a_company_alpha(comp_name, db_collection, query_entry, c_name):
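    # Googles comp_name, drops blacklisted domains, skips links whose profile already
    # exists in the database, and inserts the first remaining result as a new company
    # record (comp_name = c_name) linked to query_entry.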
    sr = getGoogleLinksForSearchText(comp_name, 3, 'normal')
    count = 0
    while (sr == 'captcha'):
        count = count + 1
        print('captcha detected, backing off (attempt):', count)
        time.sleep(1200 * count)
        sr = getGoogleLinksForSearchText(comp_name, 3, 'normal')

    # read the domain blacklist (context manager so the handle is closed)
    with open(
            three_up + '//Simplified_System//Initial_Crawling//black_list.txt',
            'r') as b_list_file:
        black_list = b_list_file.read().splitlines()
    # 'www.dnb.com'
    received_links = [i['link'] for i in sr]

    received_domains = [i.split("/")[2] for i in received_links]
    filtered_sr = []

    print('rd', received_domains)
    for i, each in enumerate(received_domains):
        if each not in black_list:
            filtered_sr.append(sr[i])

    if (len(filtered_sr)):

        res_data = is_profile_exist(filtered_sr[0]['link'])

        if (len(res_data)):
            print("Profile " + filtered_sr[0]['link'] +
                  " already existing at " + str(res_data[0]['_id']))
            return 'exist'

        filtered_sr[0]['comp_name'] = c_name
        filtered_sr[0]['query_id'] = query_entry
        record_entry = db_collection.insert_one(filtered_sr[0])
        print(filtered_sr[0])
        print("search record stored in db: ", record_entry.inserted_id)
        return record_entry.inserted_id
    else:
        print("No results found!")
        return None
Code example #5
def get_cp_dnb(entry_id,mode):
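    # Googles the company's dnb.com business-directory profile, scrapes contact-person
    # data from it and writes the result to the entry's 'dnb_cp_info' field (the bare
    # name is stored when no profile is found).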
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    # print(comp_data_entry)
    data = [i for i in comp_data_entry]
    # comp_name = data[0]['search_text']
    # print(data)
    try:
        if mode=='comp':
            comp_name = data[0]['search_text']
        elif mode == 'query':
            print(data)
            comp_name = data[0]['comp_name']
    except KeyError:
        comp_name = data[0]['link'].split("/")[2]
    det = [comp_name]
    sr = getGoogleLinksForSearchText(comp_name + " dnb.com", 3, 'normal')
    filtered_dnb = []
    for p in sr:
        if 'dnb.com/business-directory/company-profiles' in p['link']:
            filtered_dnb.append([p['title'], p['link']])
    if (len(filtered_dnb)):
        print("dnb profile found and extracting contact persons..")
        print(filtered_dnb[0])
        det.append(filtered_dnb[0])
        print(filtered_dnb[0][1])
        det.append(scrape_dnb(filtered_dnb[0][1]))
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'dnb_cp_info': det}})
        print("Successfully extended the data entry with dnb contact person data", entry_id)
    else:
        print("No dnb profile found!,Try again..")
        print(det)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'dnb_cp_info': det}})
Code example #6
def search_a_company(comp_name, db_collection, query_entry):
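    # Googles "<comp_name> Australia", keeps results on commercial TLDs, drops
    # gov/edu/uk and blacklisted domains, derives a short company name from the winning
    # domain and inserts the record into db_collection linked to query_entry.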
    try:
        sr = getGoogleLinksForSearchText(comp_name + " Australia", 5, 'normal')
        count = 0
        while (sr == 'captcha'):
            count = count + 1
            print('captcha detected, backing off (attempt):', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(comp_name + " Australia", 5, 'normal')

        # read the domain blacklist (context manager so the handle is closed)
        with open(
                three_up + '//Simplified_System//Initial_Crawling//black_list.txt',
                'r') as b_list_file:
            black_list = b_list_file.read().splitlines()
        # 'www.dnb.com'
        received_links = [i['link'] for i in sr]
        print(received_links)
        #filter the links
        filtered_sr_a = []
        filtered_received_links = []
        for i, each_l in enumerate(received_links):
            if (('.com/' in each_l) or ('.education/' in each_l)
                    or ('.io/' in each_l) or ('.com.au/' in each_l)
                    or ('.net/' in each_l) or ('.org/' in each_l)
                    or ('.co.nz/' in each_l) or ('.nz/' in each_l)
                    or ('.au/' in each_l) or ('.biz/' in each_l)):
                # print(each)
                filtered_received_links.append(each_l)
                filtered_sr_a.append(sr[i])

        print(filtered_sr_a)
        received_domains = [i.split("/")[2] for i in filtered_received_links]
        filtered_sr = []

        print('rd', received_domains)
        for i, each in enumerate(received_domains):
            # print(each)
            if (('.gov.' in each) or ('.govt.' in each) or ('.edu.' in each)
                    or ('.uk' in each)):  # filter unwanted gov/edu/uk sites
                continue
            if each not in black_list:
                filtered_sr.append(filtered_sr_a[i])

        if (len(filtered_sr)):
            #is the link already taken
            res_data = is_profile_exist(filtered_sr[0]['link'])

            if (len(res_data)):
                print("Profile " + filtered_sr[0]['link'] +
                      " already existing at " + str(res_data[0]['_id']))
                return 'exist'
            #should fix comp name
            # print('fixing comp name')
            c_n_link = filtered_sr[0]['link']
            c_n_dom = c_n_link.split("/")[2]
            try:
                c_name = c_n_dom.split("www.")[1]
            except IndexError:
                c_name = c_n_dom
            if ('.com' in c_name):
                cc_name = c_name.split(".com")[0]
            elif ('.org' in c_name):
                cc_name = c_name.split(".org")[0]
            elif ('.io' in c_name):
                cc_name = c_name.split(".io")[0]
            elif ('.net' in c_name):
                cc_name = c_name.split(".net")[0]
            else:
                cc_name = c_name
            # print(filtered_sr[0]['link'])
            filtered_sr[0]['comp_name'] = cc_name
            filtered_sr[0]['query_id'] = query_entry
            record_entry = db_collection.insert_one(filtered_sr[0])
            print(filtered_sr[0])
            print("search record stored in db: ", record_entry.inserted_id)
            return record_entry.inserted_id
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occured! try again", e)
        return 'error'
Code example #7
def search_a_query(search_query, number_of_results, db_collection,
                   query_entry):
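    # Runs a free-text Google query, keeps results on commercial TLDs, deduplicates the
    # domains, re-googles each remaining domain for its top result and inserts one
    # record per new company into db_collection. Returns the list of inserted ids.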
    try:
        sr = getGoogleLinksForSearchText(search_query, number_of_results,
                                         'normal')
        count = 0
        while (sr == 'captcha'):
            count = count + 1
            print('captcha detected, backing off (attempt):', count)
            time.sleep(1200 * count)
            sr = getGoogleLinksForSearchText(search_query, number_of_results,
                                             'normal')

        if (len(sr)):
            # print(sr)
            # record_entry = db_collection.insert_many(sr)

            for each_sr in sr:
                print(each_sr)
            received_links = [i['link'] for i in sr]
            filtered_received_links = []
            for each_l in received_links:
                if (('.com/' in each_l) or ('.education/' in each_l)
                        or ('.io/' in each_l) or ('.com.au/' in each_l)
                        or ('.net/' in each_l) or ('.org/' in each_l)
                        or ('.co.nz/' in each_l) or ('.nz/' in each_l)
                        or ('.au/' in each_l) or ('.biz/' in each_l)):
                    # print(each)
                    filtered_received_links.append(each_l)

            received_domains = [
                i.split("/")[2] for i in filtered_received_links
            ]
            print("received_domains", received_domains)
            received_domains = list(set(received_domains))
            print("received_domains", received_domains)
            ids_list = []
            for k in range(len(received_domains)):
                time.sleep(10)
                # received_domains was deduplicated, so it no longer lines up with received_links
                print(received_domains[k])
                # re-read the domain blacklist each iteration (context manager closes the handle)
                with open(
                        three_up +
                        '//Simplified_System//Initial_Crawling//black_list.txt',
                        'r') as b_list_file:
                    black_list = b_list_file.read().splitlines()
                if (received_domains[k] in black_list):  # skip blacklisted domains
                    continue
                if (('.gov.' in received_domains[k])
                        or ('.govt.' in received_domains[k])
                        or ('.edu.' in received_domains[k])
                        or ('.uk' in received_domains[k])):  # filter unwanted gov/edu/uk sites
                    continue
                sr = getGoogleLinksForSearchText(received_domains[k], 3,
                                                 'normal')
                if (len(sr) == 0):
                    sr = getGoogleLinksForSearchText(received_domains[k], 3,
                                                     'normal')
                    if (len(sr) == 0):
                        sr = getGoogleLinksForSearchText(
                            received_domains[k], 3, 'normal')
                if (len(sr) > 0):
                    print(sr[0])
                    res_data = is_profile_exist(sr[0]['link'])
                    if (len(res_data)):
                        print("Profile " + sr[0]['link'] +
                              " already existing at " +
                              str(res_data[0]['_id']))
                        continue
                    sr[0]['search_text'] = search_query
                    try:
                        c_name = received_domains[k].split("www.")[1]
                    except IndexError:
                        c_name = received_domains[k]
                    if ('.com' in c_name):
                        sr[0]['comp_name'] = c_name.split(".com")[0]
                    elif ('.org' in c_name):
                        sr[0]['comp_name'] = c_name.split(".org")[0]
                    elif ('.co' in c_name):
                        sr[0]['comp_name'] = c_name.split(".co")[0]
                    elif ('.edu' in c_name):
                        sr[0]['comp_name'] = c_name.split(".edu")[0]
                    else:
                        sr[0]['comp_name'] = c_name
                    print(sr[0])
                    sr[0]['query_id'] = query_entry
                    record_entry = db_collection.insert_one(sr[0])
                    print("search record stored in db: ",
                          record_entry.inserted_id)
                    ids_list.append(record_entry.inserted_id)
                else:
                    print("Cannot find results, skipping company")
            print(ids_list)
            return ids_list
            # print("search records stored in db: ", record_entry.inserted_ids)
        else:
            print("No results found!")
            return None
    except Exception as e:
        print("Error occured! try again", e)
        return 'error'

    #store file to a csv file
    # with open('search_results.csv', mode='w',encoding='utf8') as results_file:  # store search results in to a csv file
    #     results_writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #
    #     for each_item in sr:
    #         results_writer.writerow([each_item['title'], each_item['link'], each_item['description']])
    #     results_file.close()


# mycol = refer_collection()
# search_a_company('TOOHEYS PTY LIMITED',mycol)
# search_a_company('CALTEX PETROLEUM PTY LTD',mycol)
# search_a_query('Digital advertisement and marketing analytics services company',5,mycol)
Code example #8
def update_a_company(comp_name, db_collection, entry_id):
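    # Re-runs the Google search for an existing entry, filters the results the same way
    # as search_a_company, and updates the stored record (by ObjectId) with the first
    # acceptable hit instead of inserting a new one.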
    print(entry_id)
    sr = getGoogleLinksForSearchText(comp_name, 50, 'normal')
    count = 0
    while (sr == 'captcha'):
        count = count + 1
        print('captcha detected, backing off (attempt):', count)
        time.sleep(1200 * count)
        sr = getGoogleLinksForSearchText(comp_name, 50, 'normal')

    # read the domain blacklist (context manager so the handle is closed)
    with open(
            three_up + '//Simplified_System//Initial_Crawling//black_list.txt',
            'r') as b_list_file:
        black_list = b_list_file.read().splitlines()
    # 'www.dnb.com'
    received_links = [i['link'] for i in sr]
    print(received_links)
    #filter the links
    filtered_received_links = []
    filtered_sr_a = []
    for i, each_l in enumerate(received_links):
        if (('.com/' in each_l) or ('.education/' in each_l)
                or ('.io/' in each_l) or ('.com.au/' in each_l)
                or ('.net/' in each_l) or ('.org/' in each_l)
                or ('.co.nz/' in each_l) or ('.nz/' in each_l)
                or ('.au/' in each_l) or ('.biz/' in each_l)):
            # print(each)
            filtered_received_links.append(each_l)
            filtered_sr_a.append(sr[i])

    print(filtered_received_links)
    print(filtered_sr_a)
    received_domains = [i.split("/")[2] for i in filtered_received_links]
    filtered_sr = []

    print('rd', received_domains)
    for i, each in enumerate(received_domains):
        print(each)
        if (('.gov.' in each) or ('.govt.' in each) or ('.edu.' in each)
                or ('.uk' in each)):  # filter unwanted gov/edu/uk sites
            continue
        if each not in black_list:
            filtered_sr.append(filtered_sr_a[i])

    if (len(filtered_sr)):
        #is the link already taken
        res_data = is_profile_exist(filtered_sr[0]['link'])
        print('sss', filtered_sr[0])

        if (len(res_data)):
            print("Profile " + filtered_sr[0]['link'] +
                  " already existing at " + str(res_data[0]['_id']))
            return 'exist'

        filtered_sr[0]['comp_name'] = filtered_sr[0]['search_text']
        # filtered_sr[0]['query_id'] = query_entry

        db_collection.update_one({'_id': ObjectId(entry_id)},
                                 {'$set': filtered_sr[0]})
        # record_entry=db_collection.insert_one(filtered_sr[0])
        print(filtered_sr[0])
        print("search record stored in db updated ")
        print(entry_id)
        return entry_id
    else:
        print("No results found!")
        return None
Code example #9
def search_a_query(search_query, number_of_results, db_collection,
                   query_entry):
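    # Variant of search_a_query used for relevance checking: instead of inserting
    # records it scores each candidate's rich description against the query via
    # calculate_score and writes the rows to check_relevence_risk_rich_ocu.csv.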

    try:
        with open('check_relevence_risk_rich_ocu.csv',
                  mode='w',
                  encoding='utf8',
                  newline=''
                  ) as results_file:  # store search results in to a csv file
            results_writer = csv.writer(results_file,
                                        delimiter=',',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
            attributes_a = [
                'search_text', 'title', 'link', 'description',
                'rich_description', 'comp_name', 'match_count'
            ]
            results_writer.writerow(attributes_a)
            sr = getGoogleLinksForSearchText(search_query, number_of_results,
                                             'normal')
            # print('sr',sr)
            count = 0
            while (sr == 'captcha'):
                count = count + 1
                print('captcha detected, backing off (attempt):', count)
                time.sleep(1200 * count)
                sr = getGoogleLinksForSearchText(search_query,
                                                 number_of_results, 'normal')

            if (len(sr)):
                # print(sr)
                # record_entry = db_collection.insert_many(sr)

                for each_sr in sr:
                    print(each_sr)
                received_links = [i['link'] for i in sr]
                filtered_received_links = []
                for each_l in received_links:
                    if (('.com/' in each_l) or ('.education/' in each_l)
                            or ('.io/' in each_l) or ('.com.au/' in each_l)
                            or ('.net/' in each_l) or ('.org/' in each_l)
                            or ('.co.nz/' in each_l) or ('.nz/' in each_l)
                            or ('.au/' in each_l) or ('.biz/' in each_l)):
                        # print(each)
                        filtered_received_links.append(each_l)

                received_domains = [
                    i.split("/")[2] for i in filtered_received_links
                ]
                print("received_domains", received_domains)
                received_domains = list(set(received_domains))
                print("received_domains_no_duplicates", received_domains)
                ids_list = []
                for k in range(len(received_domains)):
                    print("*****")
                    time.sleep(10)
                    # received_domains was deduplicated, so it no longer lines up with received_links
                    print(received_domains[k])
                    # re-read the domain blacklist each iteration (context manager closes the handle)
                    with open(
                            three_up +
                            '//Simplified_System//Initial_Crawling//black_list.txt',
                            'r') as b_list_file:
                        black_list = b_list_file.read().splitlines()
                    if (received_domains[k] in black_list):  # skip blacklisted domains
                        print("skipping as included in blacklist")
                        continue
                    if (('.gov.' in received_domains[k])
                            or ('.govt.' in received_domains[k])
                            or ('.edu.' in received_domains[k])
                            or ('.uk' in received_domains[k])
                        ):  # filter unwanted gov/edu/uk sites
                        print("skipping as govt site")
                        continue
                    sr = getGoogleLinksForSearchText(received_domains[k], 3,
                                                     'initial')
                    if (len(sr) == 0):
                        sr = getGoogleLinksForSearchText(
                            received_domains[k], 3, 'initial')
                        if (len(sr) == 0):
                            sr = getGoogleLinksForSearchText(
                                received_domains[k], 3, 'initial')
                    if (len(sr) > 0):
                        print(sr[0])
                        res_data = is_profile_exist(sr[0]['link'])
                        if (len(res_data)):
                            print("Profile " + sr[0]['link'] +
                                  " already existing at " +
                                  str(res_data[0]['_id']))
                            #updating associates
                            # query_collection = refer_query_col()
                            # qq_data_entry = query_collection.find({"_id": query_entry})
                            # qq_data = [i for i in qq_data_entry]
                            # qq_attribute_keys = list(qq_data[0].keys())
                            # if ('associated_entries' in qq_attribute_keys):
                            #     print('in main',)
                            #     query_collection.update_one({'_id': query_entry},
                            #                                 {'$set': {
                            #                                     'associated_entries': qq_data[0]['associated_entries'] +
                            #                                         [res_data[0]['_id']]}})
                            # else:
                            #     query_collection.update_one({'_id': query_entry},
                            #                                 {'$set': {'associated_entries': [res_data[0]['_id']]}})
                            # continue
                        sr[0]['search_text'] = search_query
                        try:
                            c_name = received_domains[k].split("www.")[1]
                        except IndexError:
                            c_name = received_domains[k]
                        if ('.com' in c_name):
                            sr[0]['comp_name'] = c_name.split(".com")[0]
                        elif ('.org' in c_name):
                            sr[0]['comp_name'] = c_name.split(".org")[0]
                        elif ('.co' in c_name):
                            sr[0]['comp_name'] = c_name.split(".co")[0]
                        elif ('.edu' in c_name):
                            sr[0]['comp_name'] = c_name.split(".edu")[0]
                        else:
                            sr[0]['comp_name'] = c_name
                        print("search_text", sr[0]['search_text'])
                        print("rich_description", sr[0]['rich_description'])
                        print("selected_result ", sr[0])
                        s_text_fixed = sr[0]['search_text'].replace(
                            'australia', '')
                        print('filtered', s_text_fixed)
                        match_count = calculate_score(
                            s_text_fixed, sr[0]['rich_description'])
                        results_writer.writerow([
                            sr[0]['search_text'], sr[0]['title'],
                            sr[0]['link'], sr[0]['description'],
                            sr[0]['rich_description'], sr[0]['comp_name'],
                            match_count
                        ])
                        # sr[0]['query_id'] = query_entry
                        # record_entry = db_collection.insert_one(sr[0])
                        # print("search record stored in db: ", record_entry.inserted_id)
                        # ids_list.append(record_entry.inserted_id)
                    else:
                        print("Cannot find results, skipping company")
                print(ids_list)
                results_file.close()
                return ids_list
                # print("search records stored in db: ", record_entry.inserted_ids)
            else:
                print("No results found!")
                return None
    except Exception as e:
        print("Error occured! try again", e)
        return 'error'
Code example #10
def cross_check_person_linkedin(def_names, comp_name, status):
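    # For each candidate name, googles "<name> <company> linkedin", scrapes the matching
    # /in/ profiles, keeps the people associated with the company and returns those
    # holding senior roles (founder, director, CEO, and similar).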

    title_link_list = []
    for i in range(len(def_names)):

        # Defining the search text on google
        if status == 'nested':
            name = def_names[i][0]
            search_text = name.lower() + ' ' + comp_name + ' linkedin'
        elif status == 'not nested':
            name = def_names[i]
            search_text = name.lower() + ' ' + comp_name + ' linkedin'

        # Searching google with the search text and extracting the titles and the links
        sr = getGoogleLinksForSearchText(search_text, 3, 'normal')
        filtered_li = []
        for p in sr:
            if 'linkedin.com' in p['link']:
                if '/in/' in p['link']:
                    if [p['title'], p['link']] not in title_link_list:
                        if [p['title'], p['link']] not in filtered_li:
                            filtered_li.append([p['title'], p['link']])

        # collect this person's filtered results once, after scanning all search hits
        title_link_list.extend(filtered_li)

    # Remove duplicates from the nested list
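    # (frozenset-based dedup does not preserve [title, link] order, which is why the
    #  extraction loop below checks which element contains 'linkedin.com')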
    fset = set(frozenset(x) for x in title_link_list)
    title_link_list = [list(x) for x in fset]

    names_in_profiles = []
    profile_urls = []
    # Extract the names and profile urls from the extracted profiles
    for i in range(len(title_link_list)):
        if 'linkedin.com' in title_link_list[i][1]:
            temp = title_link_list[i][0].split(' - ')
            names_in_profiles.append(temp[0])
            profile_urls.append(title_link_list[i][1])
        else:
            temp = title_link_list[i][1].split(' - ')
            names_in_profiles.append(temp[0])
            profile_urls.append(title_link_list[i][0])

    scraped_profiles = []
    # Scraping the linkedin profiles
    for i in range(len(profile_urls)):
        try:
            profile_dict = scrape_person(profile_urls[i])
        except Exception:
            print('Exception Occurred')
            continue
        scraped_profiles.append(profile_dict)

    # Check if the person is associated with the company, if so extract them
    persons_associated = check_person_association_comp(scraped_profiles,
                                                       comp_name, profile_urls)
    # Check if the persons associated is in the upper hierarchy of the company
    persons_relevant = check_if_important_person(persons_associated, [
        'co founder', 'co-founder', 'co-founded', 'co founded', 'co found',
        'co-found', 'managing director', 'director', ' ceo ', 'CEO', 'ceo',
        ' coo ', 'founder', 'found', 'founding', 'executive director',
        'chief executive officer', 'chief executive',
        'chief operating officer', 'owner', 'chairman', 'chairperson'
    ])

    return persons_relevant