Ejemplo n.º 1
0
def getOrg_InfoData(OKPO_number):
    '''
    Look up a Russian company on the org-info website by its OKPO number.

    Scrapes the search page for the OKPO number, follows the first result
    link and parses the company info page paragraph by paragraph.

    :param OKPO_number: OKPO registration number of a Russian company.
    :return: dict with keys "address", "phones", "emails", "faxes" and
             "website"; fields stay empty when not found or on any error.
    '''
    orgInfo_data = {
        "address": "",
        "phones": [],
        "emails": [],
        "faxes": [],
        "website": ""
    }
    try:
        url = search_org_info_by_OKPO.format(OKPO_number)
        res = getHtmlResponse(url, use_proxy=True)
        if res:
            soup = getSoup(res)
            if soup:
                # The first anchor in the content div links to the
                # company's detail page.
                content_div = soup.find('div', class_="content")
                href = content_div.find('a')['href']
                info_page_url = org_info_base_url.format(href)
                res = getHtmlResponse(info_page_url, use_proxy=True)
                if res:
                    new_soup = getSoup(res)
                    if new_soup:
                        paras = new_soup.find_all('p')
                        for p in paras:
                            text = p.text
                            if text.startswith("Legal address:"):
                                # The site serves the address in English;
                                # translate it back to Russian.
                                orgInfo_data["address"] = translator.translate(
                                    text.split("Legal address:")[1].strip(),
                                    src="en",
                                    dest="ru").text
                            elif text.startswith("E-mail: "):
                                mails = (text.split("E-mail: ")[1]).split(',')
                                for mail in mails:
                                    orgInfo_data["emails"].append(mail.strip())
                            elif text.startswith("Site (www):"):
                                orgInfo_data["website"] = text.split(
                                    "Site (www):")[1].strip()
                            elif text.startswith("Telephone(s): "):
                                phones = (
                                    text.split("Telephone(s): ")[1]).split(',')
                                for phone in phones:
                                    m = re.search(rusian_phone_regex, phone)
                                    if m:
                                        orgInfo_data["phones"].append(
                                            m.group(0))
                            elif text.startswith("Fax:"):
                                faxes = (text.split("Fax:")[1]).split(',')
                                for fax in faxes:
                                    orgInfo_data["faxes"].append(fax.strip())

        return orgInfo_data
    except Exception:
        # Best-effort scraper: return whatever was collected so far.
        # BUGFIX: narrowed from a bare `except:` so KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        return orgInfo_data
Ejemplo n.º 2
0
def verify_facebook_link(link, domain, country, country_code):
    """
    Verify that a Facebook page belongs to the company behind *domain*.

    Loads the page's "About" tab and cross-checks the listed phone number
    (against *country_code*) and address (against *country*). When
    neither is present, falls back to searching the about-page text for
    the company's domain.

    :param link: URL of the candidate Facebook page.
    :param domain: company website domain (used for the fallback check).
    :param country: lowercase English country name to match against.
    :param country_code: expected phone-number prefix (digits only).
    :return: True when the page looks like a match, False otherwise.
    """
    response = getHtmlResponse(link, use_proxy=True)
    if (response):
        soup = getSoup(response)
        if (soup):
            link = soup.find('div', attrs={"data-key": "tab_about"})
            if link:
                about_link = f"https://www.facebook.com{link.a['href']}"
                about_response = getHtmlResponse(about_link, use_proxy=True)
                if (about_response):
                    about_soup = getSoup(about_response)
                    if (about_soup):
                        fb_phone = facebook_phone(about_soup)
                        fb_address = facebook_adress(about_soup)

                        if (fb_phone):
                            # Keep only the digits of the listed phone.
                            # BUGFIX: raw string replaces the invalid
                            # "\d" escape sequence.
                            phone = "".join(
                                d for d in fb_phone if re.search(r"\d", d))
                            if (phone.startswith(country_code)):
                                phone_match_status = "matched"
                            else:
                                return False
                        else:
                            phone_match_status = "not_found"

                        if (fb_address):
                            if (re.search(country, fb_address,
                                          flags=re.IGNORECASE)):
                                address_match_status = "matched"
                            else:
                                # Geocode the address and compare the
                                # resolved country names.
                                address_dic = get_google_formatted_address_using_address(
                                    fb_address, "en")
                                if (address_dic):
                                    address_country = translator.translate(
                                        address_dic["components"]["country"],
                                        dest="en").text
                                    if (country == address_country.lower()):
                                        address_match_status = "matched"
                                    else:
                                        return False
                                else:
                                    address_match_status = "not_found"
                        else:
                            address_match_status = "not_found"

                        if (phone_match_status == "not_found"
                                and address_match_status == "not_found"):
                            # Last resort: look for the bare domain in the
                            # about-page text blocks.
                            items = about_soup.select('._50f4')
                            if (items):
                                main_domain = domain.replace("www.", "")
                                main_domain = main_domain.split("/")[0]
                                for item in items:
                                    # BUGFIX: escape the domain so its dots
                                    # do not act as regex wildcards (e.g.
                                    # "a.com" previously matched "aXcom").
                                    if (re.search(re.escape(main_domain),
                                                  item.text)):
                                        return True
                                return False
                            else:
                                return False
                        else:
                            return True
    return False
Ejemplo n.º 3
0
def find_VK_link(domain):
    """
    Find a company's VK page via a Google search and return its URL.

    Builds an "intext:<domain> inurl:vk.com" Google query, scrapes the
    result page and returns the first result whose title mentions VK.com.

    :param domain: website domain; a leading "www." is stripped.
    :return: the matching result link, or None when nothing matched.
    """
    if ("www." in domain):
        domain = domain.split("www.")[1]

    searchtext = "intext:" + domain + " inurl:vk.com"
    searchtext = searchtext.replace(" ", "+").replace(":", "%3A")
    url = 'https://www.google.com/search?q=' + searchtext

    link = None
    response = getHtmlResponse(url, use_proxy=True)
    if (response):
        soup = getSoup(response)
        if (soup):
            search_result = soup.select('.r a:first-child')
            vk_result = []
            for a in search_result:
                # Only anchors wrapping an <h3> are organic result titles.
                # (removed the original's unused `a_text` local)
                h_text = a.find('h3')
                if h_text:
                    vk_result.append({
                        "title": h_text.text,
                        "link": a.get('href')
                    })
            item = [h for h in vk_result if re.search('VK.com', h['title'])]
            if item:
                link = item[0]['link']
    return link
Ejemplo n.º 4
0
def get_facebook_logo(url):
    """Return the src of the first <img> in the page body, or None."""
    response = getHtmlResponse(url, use_proxy=True)
    if not response:
        return None
    soup = getSoup(response)
    if not soup:
        return None
    img = soup.body.find("img")
    return img.get("src") if img else None
Ejemplo n.º 5
0
def get_baidu_result_divs(url, domain):
    """
    Fetch a Baidu results page and return the result containers whose
    text mentions *domain*.

    :param url: Baidu search-results URL.
    :param domain: domain string the result text must contain.
    :return: list of 'c-container' div Tags (possibly empty).
    """
    response = getHtmlResponse(url, use_proxy=True, headers=headers)
    if not response:
        return []
    soup = getSoup(response)
    if not soup:
        return []
    # BUGFIX: the original printed soup.title BEFORE checking soup for
    # None, raising AttributeError whenever parsing failed.
    print(soup.title)
    result_container_divs = soup.find_all('div', class_='c-container')
    for d in result_container_divs:
        print(d.text[:100], "\n")
    return [
        container for container in result_container_divs
        if domain in container.text
    ]
Ejemplo n.º 6
0
def scrape_qichacha(query, search_domain=False):
    """
    Search qichacha.com for *query* and scrape the results table.

    :param query: company name, TIN, or domain; when search_domain is
                  True a leading "www." is stripped before matching.
    :param search_domain: whether *query* should be treated as a domain.
    :return: list of dicts produced by extract_td, each with an extra
             'ImageUrl' key ('' when the row has no logo image).
    """
    qichacha_search_url = 'https://www.qichacha.com/search?key={}'
    # Hard-coded session cookies; will expire and need refreshing.
    my_qichacha_string_cookies = "QCCSESSID=943estaujkcnlmj6617l1qj0d3; UM_distinctid=1719c8ea6862e9-0116078de66c3-5313f6f-100200-1719c8ea68871; CNZZDATA1254842228=997063605-1587466246-%7C1587466246; zg_did=%7B%22did%22%3A%20%221719c8ea8c221b-05c25cf449a31-5313f6f-100200-1719c8ea8c35c9%22%7D; hasShow=1; _uab_collina=158746952916961739338216; acw_tc=2ff62b9e15874695316588568e4820daa195ec4fb6592712661a92bf35; Hm_lvt_78f134d5a9ac3f92524914d0247e70cb=1587469530,1587470690; Hm_lpvt_78f134d5a9ac3f92524914d0247e70cb=1587470721; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201587469527239%2C%22updated%22%3A%201587470730031%2C%22info%22%3A%201587469527244%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%221056ee824ee53e234a33f1988ac0ea63%22%7D"

    url = qichacha_search_url.format(query)
    print(url)
    lst = []
    response = getHtmlResponse(
        url,
        cookies=stringCookiesToDict(my_qichacha_string_cookies),
        use_proxy=True)
    if (response):
        soup = getSoup(response)
        if (soup):
            table = soup.find("table")
            if (table):
                td_list = table.find_all("td")
                if (td_list):
                    if search_domain:
                        query = query.replace('www.', "")

                    for td in td_list:
                        # NOTE(review): query is compiled verbatim, so dots
                        # in a domain act as regex wildcards here — confirm
                        # whether that looseness is intended.
                        matched_tag = td.find(
                            text=re.compile(query)
                        )  # looking for tags with query provided in text
                        if matched_tag:
                            try:
                                imgUrl = matched_tag.find_parent('tr').find(
                                    'img')['src']
                            # BUGFIX: find('img') returns None when the row
                            # has no image, and None['src'] raises TypeError
                            # — which the original AttributeError clause
                            # did not catch.
                            except (AttributeError, TypeError):
                                imgUrl = ''
                            parent_td = matched_tag.find_parent('td')
                            data = extract_td(parent_td, query, search_domain)
                            data['ImageUrl'] = imgUrl
                            lst.append(data)

            else:
                print('No Table found!')
    return lst
Ejemplo n.º 7
0
def _fill_spark_data(new_soup, spark_data):
    """
    Populate *spark_data* in place from a Spark company-page soup.

    Extracts legal name, address, TIN, OKPO, supervisor, founders,
    registration date and main activity. Shared by both branches of
    getSparkData (the original duplicated this parsing verbatim).
    """
    spark_data["legal_name"] = new_soup.find(
        "div", attrs={"itemprop": "legalName"}).text
    spark_data["address"] = new_soup.find(
        "div", attrs={"itemprop": "address"}).text
    spark_data["TIN"] = new_soup.find(
        "div", attrs={"itemprop": "taxID"}).text

    # Characteristic rows come as name-div followed by value-div siblings.
    name_divs = new_soup.find_all(
        "div", class_="company-characteristics__name")
    for div in name_divs:
        if div.text == "ОКПО":
            for d in div.next_siblings:
                if not isinstance(d, NavigableString):
                    if d['class'][0] == "company-characteristics__value":
                        spark_data["OKPO"] = d.text
        elif div.text == "Руководитель":
            for d in div.next_siblings:
                if not isinstance(d, NavigableString):
                    if d['class'][0] == "company-characteristics__value":
                        m = re.search(r'\w[\s\w,]+\w+', d.text)
                        if m is not None:
                            # Keep only the name, drop the job title.
                            spark_data["supervisor"] = m.group(
                                0).split(',')[0]
        elif div.text == "Учредители":
            for d in div.next_siblings:
                if not isinstance(d, NavigableString):
                    if d['class'][0] == "company-characteristics__value":
                        m = re.search(r'\w[\s\w,]+\w+', d.text)
                        if m is not None:
                            for founder in m.group(0).split(','):
                                spark_data["founders"].append(
                                    re.search(r'\w+\s*\w+',
                                              founder).group(0))
        elif div.text == "Дата регистрации":
            for d in div.next_siblings:
                if not isinstance(d, NavigableString):
                    if d['class'][0] == "company-characteristics__value":
                        spark_data["registration_date"] = d.text

    spark_data["main_activity"] = new_soup.find(
        'div', class_="okved-list__name").text


def getSparkData(query, check_length=False):
    '''
    Search the Spark website for a company and return its registered data.

    :param query: a company name, TIN or domain.
    :param check_length: when False (default) the first search result is
        used (suitable for TIN/domain queries). When True the result must
        be unique, otherwise empty data is returned (suitable for name
        queries).
    :return: dict with keys "legal_name", "address", "TIN", "founders",
        "supervisor", "registration_date", "main_activity", "OKPO";
        fields stay empty on any error.
    '''
    url = spark_search_url.format(query)
    spark_data = {
        "legal_name": '',
        "address": "",
        'TIN': '',
        'founders': [],
        'supervisor': '',
        'registration_date': '',
        'main_activity': '',
        "OKPO": "",
    }
    try:
        soup = getSoup(getHtmlResponse(url, use_proxy=True))
        if check_length:
            # Name query: require exactly one hit, else return empty data.
            if len(soup.find_all(
                    "li", class_="search-result-list__item")) != 1:
                return spark_data
        result = soup.find("li", class_="search-result-list__item")
        new_url = "http://www.spark-interfax.ru/" + result.find("a")["href"]
        new_soup = getSoup(getHtmlResponse(new_url, use_proxy=True))
        _fill_spark_data(new_soup, spark_data)
        return spark_data
    except Exception:
        # Best-effort scraper: return whatever was collected so far.
        # Narrowed from a bare except.
        return spark_data
Ejemplo n.º 8
0
def _fill_sbis_contacts(soup, sbis_data):
    """
    Populate address/website/phones/emails in *sbis_data* in place from a
    SBIS company-card soup. Shared by both branches of getSBIS_Data (the
    original duplicated this extraction verbatim).
    """
    add_div = soup.find("div", class_="cCard__Contacts-Address")
    if add_div:
        sbis_data["address"] = add_div.text.strip()
    main_div = soup.find(
        'div',
        class_="cCard__Contacts-Additional cCard__Contacts-Additional-Mobile")
    if main_div:
        child_divs = main_div.find_all("div", class_="cCard__Contacts-Values")
        if child_divs:
            # Anchors with an http(s) href are the website; everything
            # else is collected as free text to mine phones/emails from.
            text = ""
            for div in child_divs:
                a = div.find("a")
                if a and a.get("href").startswith("http"):
                    sbis_data["website"] = a.text.strip()
                else:
                    text += div.text + '\n'
            sbis_data["phones"] = [
                m.group(0) for m in re.finditer(rusian_phone_regex, text)
            ]
            sbis_data["emails"] = [
                m.group(0) for m in re.finditer(email_regex, text)
            ]


def getSBIS_Data(query, only_contact=True):
    '''
    Search the SBIS website for a company and return contact information.

    :param query: a company name or a TIN.
    :param only_contact: when True (default) the first candidate with a
        10-digit TIN is used and only contact info is returned (suitable
        for TIN queries). When False the result must be unique and the
        TIN is returned as well (suitable for name queries).
    :return: dict with "address", "phones", "emails", "website" (plus
        "TIN" when only_contact is False); fields stay empty on error.
    '''
    if only_contact:
        sbis_data = {"address": "", "phones": [], "emails": [], "website": ""}
    else:
        sbis_data = {
            "address": "",
            "phones": [],
            "emails": [],
            "website": "",
            "TIN": None
        }
    # JSON-RPC payload for the SBIS "Контрагент.List" search endpoint;
    # field names are the Russian schema the API expects.
    sbis_query = {
        "jsonrpc": "2.0",
        "protocol": 5,
        "method": "Контрагент.List",
        "params": {
            "Фильтр": {
                "d": [True, None, True, None, query, 1, None],
                "s": [{
                    "t": "Логическое",
                    "n": "Misspelling"
                }, {
                    "t": "Строка",
                    "n": "ИдВидДеятельности"
                }, {
                    "t": "Логическое",
                    "n": "ИскатьВФилиалах"
                }, {
                    "t": "Строка",
                    "n": "Регион"
                }, {
                    "t": "Строка",
                    "n": "Реквизиты"
                }, {
                    "t": "Число целое",
                    "n": "Состояние"
                }, {
                    "t": "Строка",
                    "n": "ТипЛица"
                }],
                "_type":
                "record"
            },
            "Сортировка": {
                "d": [[False, "Релевантность", True]],
                "s": [{
                    "t": "Логическое",
                    "n": "l"
                }, {
                    "t": "Строка",
                    "n": "n"
                }, {
                    "t": "Логическое",
                    "n": "o"
                }],
                "_type":
                "recordset"
            },
            "Навигация": {
                "d": [True, 30, 0],
                "s": [{
                    "t": "Логическое",
                    "n": "ЕстьЕще"
                }, {
                    "t": "Число целое",
                    "n": "РазмерСтраницы"
                }, {
                    "t": "Число целое",
                    "n": "Страница"
                }],
                "_type":
                "record"
            },
            "ДопПоля": []
        },
        "id": 1
    }
    try:
        r = requests.post(sbis_search_url,
                          json=sbis_query,
                          headers=headers,
                          timeout=35)
        json_data = r.json()["result"]["d"]
        if only_contact:
            # query is TIN, no need to check length of json_data;
            # take the first candidate with a 10-digit TIN.
            if json_data:
                for candidate in json_data:
                    if len(candidate[3]) == 10:
                        url = sbis_info_url.format(candidate[3], candidate[4])
                        res = getHtmlResponse(url, use_proxy=True)
                        if res:
                            soup = getSoup(res)
                            if soup:
                                _fill_sbis_contacts(soup, sbis_data)
                        break
        else:
            # Name query: only proceed on a unique search result.
            if json_data and len(json_data) == 1:
                sbis_data["TIN"] = json_data[0][3]
                url = sbis_info_url.format(json_data[0][3], json_data[0][4])
                res = getHtmlResponse(url, use_proxy=True)
                if res:
                    soup = getSoup(res)
                    if soup:
                        _fill_sbis_contacts(soup, sbis_data)
        return sbis_data
    except Exception:
        # Best-effort: return whatever was collected. Narrowed from a
        # bare except.
        return sbis_data
Ejemplo n.º 9
0
def website_info(domain, org_name, country="global"):
    """
    Scrape a company's website (main page plus a contact page, when one
    can be found) and extract website data via get_website_data.

    :param domain: bare domain of the site; "-" means no website known.
    :param org_name: company name, forwarded to get_website_data.
    :param country: country key for loading the scraping context;
                    falls back to the "global" context.
    :return: dict {"domain": ..., "result": ...}; "result" is an empty
             dict when the main page could not be fetched or parsed.
    """
    #domain = validate_domain(domain)
    print('>>>>>>>>>>>>>>>>>> NOW website_info: ', domain, 'Country :',
          country)
    if domain != "-":
        url = "http://" + domain
        print("url >>> ", url)
    else:
        # "-" is the placeholder for "no domain known" -- nothing to scrape.
        return {"domain": domain, "result": {}}

    country_context = load_country_context(
        country, add_with_global_setting=True
    ) if country else load_country_context("global")
    language = country_context.get("language", "en")
    # getting main-page and contact-page soup object
    main_page_soup = None
    contact_page_soup = None
    res = getHtmlResponse(url, use_proxy=False)
    if (res):
        print("status code >>> ", res.status_code)
        # res.url may differ from the requested url after redirects.
        final_url = res.url
        print("final_url: ", final_url)
        main_page_soup = getSoup(res)
        if (main_page_soup):
            # trying to find contact page url, resolved against the
            # final (post-redirect) url first
            contact_page_url = find_second_page_url(main_page_soup, final_url,
                                                    country_context)
            if (contact_page_url):
                print("contact_page_url >>> ", contact_page_url)
                res = getHtmlResponse(contact_page_url, use_proxy=False)
                if (res):
                    contact_page_soup = getSoup(res)
                    if (not contact_page_soup):
                        print("No contact page soup")
                else:
                    # Fallback: re-resolve the contact link against the
                    # original (pre-redirect) url and retry once.
                    contact_page_url = find_second_page_url(
                        main_page_soup, url, country_context)
                    if (contact_page_url):
                        print("contact_page_url >>> ", contact_page_url)
                        res = getHtmlResponse(contact_page_url,
                                              use_proxy=False)
                        if (res):
                            contact_page_soup = getSoup(res)
                            if (not contact_page_soup):
                                print("No contact page soup")
                        else:
                            print("No contact page response...")
            else:
                print("No contact page url...")
        else:
            print("No main page soup...")
    else:
        print("No main response...")

    # getting website data using main-page and contact page; the contact
    # page is optional (may be None) but the main page is required.
    if (main_page_soup):
        website_data = get_website_data(main_page_soup, contact_page_soup,
                                        country, country_context, domain, url,
                                        org_name, language)
        return {"domain": domain, "result": website_data}
    else:
        return {"domain": domain, "result": {}}