Code example #1
    def fase2(url):
        # phase 2.1: parse the seller's profile page
        page_user = requests.get(url)
        soup_user = BeautifulSoup(page_user.content, 'html.parser')
        name_vendedor = soup_user.find_all(
            "h3", {"class": "store-info__name"})[0].contents[0]
        Califications = soup_user.find_all(
            "span", {"class": "buyers-feedback-qualification"})
        calification_points = []
        for C in Califications:
            calification_points.append(C.contents[4])

        reputation = soup_user.find_all("div",
                                        {"class": "data-level__wrapper"})[0]
        reputation = reputation.find_all("span",
                                         {"class": "data-level__number"})
        recomendado = reputation[0].contents[0]
        ventas_completadas = reputation[1].contents[0]
        años_vendiendo = reputation[-1].contents[0]

        time = soup_user.find_all("p",
                                  {"class": "data-level__description"})[-1]
        time = time.find_all("span")[0].contents[-1].split(' ')[1]
        if (time == 'años'):
            time = 'anios'

        return [
            time, calification_points, recomendado, ventas_completadas,
            años_vendiendo, name_vendedor
        ]
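A minimal usage sketch for fase2 (assuming requests and BeautifulSoup are imported and the function is reachable at module level); the seller-profile URL below is a hypothetical placeholder:

import requests
from bs4 import BeautifulSoup

# hypothetical MercadoLibre seller-profile URL; fase2 expects a page
# containing the store-info__name and data-level__* elements used above
seller_url = "https://www.example.com/perfil/vendedor"
tiempo, puntos, recomendado, ventas, anios, nombre = fase2(seller_url)
print(nombre, ventas, anios)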
Code example #2
File: bot.py Project: samH99/LexisNexis
def get_projects(page):
    ret_projects = []
    curr = page.find("div",
                     {"class": "editable-item section-item current-position"})
    t_curr = {}
    try:
        t_curr["title"] = str(curr.find("a", {"name": "title"}).text)
    except:
        t_curr["title"] = "?"

    aux_str = ""

    time = curr.find("span", {"class": "experience-date-locale"})
    try:
        t_curr["duration"] = str(time.find("time").text + "Present")
    except:
        t_curr["duration"] = "?"
    try:
        t_curr["place"] = str(curr.find("span", {"class": "locality"}).text)
    except:
        t_curr["place"] = "?"
    ret_projects.append(t_curr)
    for link in page.find_all(
            "div", {"class": "editable-item section-item past-position"}):
        temp = {}
        try:
            temp["title"] = str(link.find("a", {"name": "title"}).text)
        except:
            temp["title"] = "?"
        try:
            time = link.find("span", {"class": "experience-date-locale"})
            aux_str = ""
            for tim in time.find_all("time"):
                aux_str += tim.text
            # print aux_str
            temp["duration"] = aux_str
        except:
            temp["duration"] = "?"
        try:
            temp["place"] = str(link.find("span", {"class": "locality"}).text)
        except:
            temp["place"] = "?"
        ret_projects.append(temp)

    return ret_projects
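A short usage sketch for get_projects: it expects an already-parsed BeautifulSoup object of a LinkedIn profile page; the saved-HTML filename below is a placeholder.

from bs4 import BeautifulSoup

# placeholder: HTML of a LinkedIn profile saved to disk beforehand
with open("profile.html", encoding="utf-8") as f:
    page = BeautifulSoup(f.read(), "html.parser")

for project in get_projects(page):
    print(project["title"], project["duration"], project["place"])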
Code example #3
File: Crawler.py Project: youthink0/LH_CS_project
        a = ho.find_all('span', {'class': 'xQ82C e8fRJf'})[0].text
        a = a.split(' ')[0]
        hosttest.append(a)
        #print(a)

    #print('==============================')
    for url in url_list:
        # extract the news article URL
        a = url.find_all('a')[0]['href']
        urltest.append(a)
        #print(a)

    #print('==============================')
    for time in time_list:
        # extract the news publication time
        a = time.find_all('span', {'class': 'f nsa fwzPFf'})[0].text
        timetest.append(a)
        #print(a)

    #print('==============================')

    count = TestIdCount
    for i in range(len(titletest)):
        dateflag = 0
        if hosttest[i] not in bad_host:
            count = count + 1
            if IgnoreDateFlag != 2:
                to_append = [
                    int(count), titletest[i], "neutral", hosttest[i],
                    urltest[i], timetest[i], ""
                ]
Code example #4

        try:
            grade = curr_section.find(
                'p', {
                    'class':
                    'pv-entity__secondary-title pv-entity__grade t-14 t-black t-normal'
                }).find('span', {'class': 'pv-entity__comma-item'})
            grades.append(grade.get_text())
        except Exception as e:
            # print("Education grade Exception",e)
            grades.append('')

        try:
            time = curr_section.find(
                'p',
                {'class': 'pv-entity__dates t-14 t-black--light t-normal'})
            dates.append((time.find_all('time')[1].get_text()))
        except Exception as e:
            # print("Education time Exception",e)
            dates.append('')

    for i in range(len(edu_section)):
        education_info_list.append([
            college_names[i], degree_names[i], field_names[i], dates[i],
            grades[i]
        ])
except Exception as e:
    # no education added
    # print("Education Section Exception", e)
    pass

# print(education_info_list)
Code example #5
File: Classifier.py Project: youthink0/LH_CS_project
def crawl(keyword, sdates, edates):
    titletest = list()
    hosttest = list()
    urltest = list()
    timetest = list()
    to_append = list()
    dic = ["id", "title", "category", "host", "url", "time", "segment",]
 
    
    bad_host = ['中工网','中国奥林匹克委员会','星洲网','China Press','手机网易网','新浪网','东方财富网','千龙网','搜狐'
               ,'中国新闻网','汉丰网','京报网','人民网','中国侨网','杭州网','中华人民共和国外交部','华体网','NTDTV','新京报'
               ,'联合国新闻','自由亚洲电台','法国国际广播电台','多维新闻网','BBC 中文网','青年日報','联合早报','新浪网']
    df1 = pd.DataFrame(columns=dic)
    TestIdCount = 0
    
    IgnoreDateFlag = 0
    
    StartNums = sdates
    
    if StartNums == '':
        IgnoreDateFlag = 1
        print('no limit')
    else:
        nums_tmp = StartNums.split('/')
        snums = []
        for n in nums_tmp:
            if n.isdigit():
                snums += [int(n)]

            if len(snums) == 3:
                break
        if len(snums) != 3:
            IgnoreDateFlag = 2
            print('wrong date form')
    a = 1
    
    EndNums = edates
    
    if EndNums == '':
        IgnoreDateFlag = 1
        print('no limit')
    else:
        nums_tmp = EndNums.split('/')
        enums = []
        for n in nums_tmp:
            if n.isdigit():
                enums += [int(n)]

            if len(enums) == 3:
                break
        if len(enums) != 3:
            IgnoreDateFlag = 2
            print('wrong date form')
       
    headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    }
    if IgnoreDateFlag == 1:
        payload = {'q': str(keyword), 'tbm':'nws', 'lr':'lang_zh-TW', 'num':'100', 'tbs':'qdr:y'}
        
    elif IgnoreDateFlag == 0:
        payload = {'q': str(keyword), 'tbm':'nws', 'lr':'lang_zh-TW', 'num':'100', 'tbs':'cdr:1,cd_min:' \
                   +str(snums[1])+'/'+str(snums[2])+'/'+str(snums[0]) \
                   +',cd_max:'+str(enums[1])+'/'+str(enums[2])+'/'+str(enums[0])}
    else:
        # date parsing failed (IgnoreDateFlag == 2); fall back to a past-year
        # query so that 'payload' is always defined for the request below
        payload = {'q': str(keyword), 'tbm':'nws', 'lr':'lang_zh-TW', 'num':'100', 'tbs':'qdr:y'}
        
    s = requests.Session()
    re = s.get("https://www.google.com.tw/search", params=payload, headers=headers)
    print(re.url)
        
    content = re.text
    #print(content)
    soup = BeautifulSoup(content,"html.parser")
    title_list = soup.find_all('div',{'class':'gG0TJc'})
    host_list = soup.find_all('div',{'class':'gG0TJc'})
    url_list = soup.find_all('div',{'class':'gG0TJc'})
    time_list = soup.find_all('div',{'class':'gG0TJc'})
    
    for ti in title_list:
        # extract the news headline
        a = ti.find_all('a')[0].text
        titletest.append(a)
        #print(a)
        
    for ho in host_list:
        # extract the news outlet (host) name
        a = ho.find_all('span',{'class':'xQ82C e8fRJf'})[0].text
        a = a.split(' ')[0]
        hosttest.append(a)
        #print(a)
        
    for url in url_list:
        # extract the news article URL
        a = url.find_all('a')[0]['href']
        urltest.append(a)
        #print(a)
    
    for time in time_list:
        # extract the news publication time
        a = time.find_all('span',{'class':'f nsa fwzPFf'})[0].text
        timetest.append(a)
        #print(a)
    
    count = TestIdCount
    for i in range(len(titletest)):
        dateflag = 0
        if hosttest[i] not in bad_host:
            count = count + 1
            if IgnoreDateFlag != 2:
                to_append = [int(count),titletest[i],"neutral",hosttest[i],urltest[i],timetest[i],""]
                a_series = pd.Series(to_append, index = df1.columns)
                df1 = df1.append(a_series, ignore_index=True)
    
                '''
                print('======[',i,']=========')
                print(titletest[i])
                print(urltest[i])
                print(hosttest[i])
                print(timetest[i])
                print(" ")
                '''
        to_append.clear()
        
    TestIdCount = TestIdCount + len(titletest)
    count = 0
    
    #print(to_append)
    titletest.clear()
    hosttest.clear()
    urltest.clear()
    timetest.clear()
    
    df1.to_csv('test.csv', index=False, encoding='UTF-8_sig')
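A hedged usage sketch for crawl: the keyword is illustrative, dates use the YYYY/MM/DD form parsed above, and results are written to test.csv as in the function body.

# date-limited crawl (dates in YYYY/MM/DD form)
crawl('AI', '2020/01/01', '2020/12/31')

# no date limit: empty strings trigger the default past-year query
crawl('AI', '', '')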
Code example #6
def scrapper(link):

    #Initialize Options to start Chrome as headless in selenium
    #(assumes `from selenium import webdriver` at the top of the file)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')

    #Initialize the chrome webdriver as 'browser'
    browser = webdriver.Chrome(options=options)

    #Get the login page for linkedin
    browser.get('https://www.linkedin.com/uas/login')

    #Open the file with the username and password for LinkedIn login
    file = open('config.txt')
    lines = file.readlines()
    username = lines[0].strip()  # strip the trailing newline read from the file
    password = lines[1].strip()

    #Username and Password for login
    elementID = browser.find_element_by_id('username')
    elementID.send_keys(username)

    elementID = browser.find_element_by_id('password')
    elementID.send_keys(password)

    elementID.submit()
    global time
    time.sleep(5)

    #Profile Link to be scraped
    #link = "https://www.linkedin.com/in/rishab-saini/"
    browser.get(link)

    #pause before scrolling
    SCROLL_PAUSE_TIME = 6

    #Get the scroll height of the page
    last_height = browser.execute_script("return document.body.scrollHeight")

    #scroll the entire page due to dynamic loading of the webpage we need to load the entire webpage by scrolling
    for i in range(3):
        # Scroll down to bottom
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/3);")
        time.sleep(SCROLL_PAUSE_TIME / 2)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight*(2/3));")
        time.sleep(SCROLL_PAUSE_TIME / 2)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script(
            "return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    #try to expand sections(if available), else pass
    try:
        #click to expand education section
        education_expand_button = browser.find_element_by_xpath(
            "//section[@id='education-section']//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();",
                               education_expand_button)
    except Exception as e:
        #print("education_expand_button Exception:", e)
        pass

    try:
        #click to expand projects section
        projects_expand_button = browser.find_element_by_xpath(
            "//div[@class='pv-accomplishments-block__content break-words']//button[@aria-label='Expand projects section' and @aria-expanded='false']"
        )
        browser.execute_script("arguments[0].click();", projects_expand_button)
    except Exception as e:
        # print("projects_expand_button Exception:", e)
        pass

    try:
        #click to expand certifications section
        certifications_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();",
                               certifications_expand_button)
    except Exception as e:
        # print("certifications_expand_button Exception:", e)
        pass

    try:
        # click to expand experience section
        experiences_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();",
                               experiences_expand_button)

        time.sleep(2)

        #inline-show-more-text__button link
        experiences_show_more_expand_button = browser.find_element_by_xpath(
            "//button[@class='inline-show-more-text__button link']")
        #print(experiences_show_more_expand_button)
        browser.execute_script("arguments[0].click();",
                               experiences_show_more_expand_button)
    except Exception as e:
        # print("experiences_expand_button Exception:", e)
        pass

    try:
        # click to expand skills section
        skills_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar artdeco-button artdeco-button--tertiary artdeco-button--3 artdeco-button--fluid']"
        )
        browser.execute_script("arguments[0].click();", skills_expand_button)
    except Exception as e:
        # print("skills_expand_button Exception:", e)
        pass

    try:
        # click to expand volunteering section
        volunteer_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();",
                               volunteer_expand_button)
    except Exception as e:
        # print("volunteer_expand_button Exception:", e)
        pass

    #use beautiful soup for html parsing
    src = browser.page_source
    soup = BeautifulSoup(src, 'lxml')

    #BASIC INFO LIST
    basic_info_list = []

    name_div = soup.find('div', {'class': 'flex-1 mr5'})
    name_loc = name_div.find_all('ul')
    fullname = name_loc[0].find('li').get_text().strip()
    try:
        first_name, last_name = fullname.split()
    #above statement fails when a person has put their name as firstname, middlename, lastname
    except:
        first_name, middle_name, last_name = fullname.split()

    basic_info_list.append(first_name)
    basic_info_list.append(last_name)

    headline = name_div.find('h2').get_text().strip()
    basic_info_list.append(headline)
    basic_info_list.append(link)

    #appending empty strings for email_id, phone_number, age and github_link
    basic_info_list.append('')
    basic_info_list.append('')
    basic_info_list.append('')
    basic_info_list.append('')

    #print(basic_info_list)

    #education section
    education_info_list = []
    try:
        edu_section = soup.find('section', {
            'id': 'education-section'
        }).find('ul')
        edu_section = edu_section.find_all(
            'div', {
                'class':
                'pv-entity__summary-info pv-entity__summary-info--background-section'
            })
        college_names = []
        degree_names = []
        field_names = []
        grades = []
        dates = []
        for x in range(len(edu_section)):
            curr_section = edu_section[x]
            try:
                college_name = curr_section.find(
                    'h3',
                    {'class': 'pv-entity__school-name t-16 t-black t-bold'})
                college_names.append(college_name.get_text())
            except Exception as e:
                #print("Education college_name Exception",e)
                college_names.append('')

            try:
                degree_name = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                degree_names.append(degree_name.get_text())
            except Exception as e:
                #print("Education degree_name Exception",e)
                degree_names.append('')

            try:
                field_name = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                field_names.append(field_name.get_text())
            except Exception as e:
                #print("Education field_name Exception",e)
                field_names.append('')

            try:
                grade = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__grade t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                grades.append(grade.get_text())
            except Exception as e:
                #print("Education grade Exception",e)
                grades.append('')

            try:
                time = curr_section.find(
                    'p',
                    {'class': 'pv-entity__dates t-14 t-black--light t-normal'})
                dates.append((time.find_all('time')[1].get_text()))
            except Exception as e:
                #print("Education time Exception",e)
                dates.append('')

        for i in range(len(edu_section)):
            education_info_list.append([
                college_names[i], degree_names[i], field_names[i], dates[i],
                grades[i]
            ])
    except Exception as e:
        #no education added
        #print("Education Section Exception", e)
        pass

    #print(education_info_list)

    #Project Section
    projects_info_list = []
    project_titles = []
    try:
        project_section = soup.find('div',
                                    {'id': 'projects-expandable-content'})
        project_section = project_section.find(
            'ul', {'class': 'pv-accomplishments-block__list'})

        projects = project_section.find_all(
            'h4', {'class': 'pv-accomplishment-entity__title t-14 t-bold'})

        for i in range(len(projects)):
            project_name = projects[i].get_text().split('\n')[2]
            project_name = re.sub(' +', ' ', project_name)
            project_titles.append(project_name.strip())

        projects = project_section.find_all(
            'p', {'class': 'pv-accomplishment-entity__date t-14'})
        project_time = []
        for i in range(len(project_titles)):
            try:
                project_date = projects[i].get_text().split('\n')[1]
                project_date = re.sub(' +', ' ', project_date)
                project_time.append(project_date[1:])
            except Exception as e:
                # print("project_date Exception", e)
                project_time.append('')

        project_descriptions = []
        projects2 = project_section.find_all(
            'p', {'class': 'pv-accomplishment-entity__description t-14'})
        for i in range(len(project_titles)):
            try:
                next_empty_elem = projects2[i].findNext('div')
                curr_proj_desc = next_empty_elem.next_sibling
                project_descriptions.append(curr_proj_desc.strip())
            except Exception as e:
                # print("curr_proj_desc Exception", e)
                project_descriptions.append('')

        #Construct projects_info_list from above data
        for i in range(len(project_titles)):
            projects_info_list.append(
                [project_titles[i], project_time[i], project_descriptions[i]])
    except Exception as e:
        #no projects added
        #print("Project Section Exception", e)
        pass
    #print(projects_info_list)

    #certifications section
    certifications_info_list = []
    try:
        certificates_section = soup.find('section',
                                         {'id': 'certifications-section'})

        list_items = certificates_section.find(
            'ul', {
                'class':
                'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more'
            })
    except Exception as e:
        #print("certificates_section Exception", e)
        pass
    try:
        if list_items is None:
            list_items = certificates_section.find(
                'ul', {
                    'class':
                    'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'
                })

        items = list_items.find_all(
            'li', {
                'class':
                'pv-profile-section__sortable-item pv-certification-entity ember-view'
            })
        cert_names_list = []
        cert_issuer_list = []
        cert_dates_list = []

        for i in range(len(items)):
            curr_cert_name = items[i].find('h3', {'class': 't-16 t-bold'})
            curr_cert_name = curr_cert_name.get_text().strip()
            cert_names_list.append(curr_cert_name)

            curr_issuer_name = items[i].find_all('p', {'class': 't-14'})[0]
            curr_issuer_name = curr_issuer_name.get_text().strip()
            curr_issuer_name = curr_issuer_name.replace(
                'Issuing authority\n', '')
            cert_issuer_list.append(curr_issuer_name)

            curr_cert_date = items[i].find_all('p', {'class': 't-14'})[1]
            curr_cert_date = curr_cert_date.get_text().strip()
            curr_cert_date = curr_cert_date.replace(
                'Issued date and, if applicable, expiration date of the certification or license\n',
                '').replace('No Expiration Date', '').replace('Issued ', '')
            cert_dates_list.append(curr_cert_date)

        #adding elements in certifications_info_list as per schema
        for i in range(len(cert_names_list)):
            certifications_info_list.append(
                [cert_names_list[i], cert_dates_list[i], cert_issuer_list[i]])

    except Exception as e:
        #no certificates added
        #print("Certificates Section Exception", e)
        pass

    #print(certifications_info_list)

    #Experience Section
    experience_info_list = []
    list_items = []
    items = []

    try:
        experience_section = soup.find('section',
                                       {'class': 'experience-section'})
        #print(experience_section)

        list_items = experience_section.find(
            'ul', {
                'class':
                'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more'
            })
    except Exception as e:
        #print("experience_section Exception", e)
        pass

    try:
        if list_items is None:
            list_items = experience_section.find(
                'ul', {
                    'class':
                    'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'
                })

        items = list_items.find_all(
            'li', {
                'class':
                'pv-entity__position-group-pager pv-profile-section__list-item ember-view'
            })
        company_names_list = []
        position_list = []
        dates_employed_list = []
        description_list = []

        for i in range(len(items)):
            try:
                curr_name = items[i].find('p', {
                    'class':
                    'pv-entity__secondary-title t-14 t-black t-normal'
                })
                curr_name = curr_name.get_text().strip()
                curr_name = curr_name.split('\n')[0].strip()
                #print("1st currname", curr_name)
                company_names_list.append(curr_name)
            except Exception as e:
                #print("Experience curr_name Exception:", e)
                pass

            try:
                if curr_name is None:
                    curr_name = items[i].find('h3',
                                              {'class': 't-16 t-black t-bold'})
                    curr_name = curr_name.get_text().strip()
                    curr_name = curr_name.replace("Company Name\n", '')
                    company_names_list.append(curr_name)
            except Exception as e:
                #print("Experience curr_name Exception:", e)
                pass

            try:
                curr_position = items[i].find('h3',
                                              {'class': 't-16 t-black t-bold'})
                curr_position = curr_position.get_text().strip()
                curr_position = curr_position.replace("Company Name\n", '')
                position_list.append(curr_position)
            except Exception as e:
                #print("Experience curr_position Exception:", e)
                pass

            try:
                curr_dates = items[i].find('h4', {
                    'class':
                    'pv-entity__date-range t-14 t-black--light t-normal'
                })
                curr_dates = curr_dates.get_text().strip()
                curr_dates = curr_dates.replace('Dates Employed\n', '')
                dates_employed_list.append(curr_dates)
            except Exception as e:
                #print("Experience curr_dates Exception:", e)
                pass

            try:
                curr_description = items[i].find(
                    'div', {
                        'class':
                        'pv-entity__extra-details t-14 t-black--light ember-view'
                    })
                curr_description = curr_description.get_text().strip()
                curr_description = curr_description.replace(
                    '\n\n\n\n\n        see less', '')
                curr_description = curr_description.replace(
                    '\n\n   \n  \n\n\n\n\n\n\n\n\n\n', ' ')
                curr_description = curr_description.replace(
                    '\n\n    \n…\n\n        see more', '')
                curr_description = curr_description.replace('\n       ', '.')
                curr_description = curr_description.replace('\n\n', '.')
                description_list.append(curr_description)
            except Exception as e:
                #print("Experience curr_description Exception:", e)
                #Add empty description for normalization of data
                description_list.append('')

        #create company_names_list from above data
        for i in range(len(company_names_list)):
            experience_info_list.append([
                company_names_list[i], position_list[i],
                dates_employed_list[i], description_list[i]
            ])

    except Exception as e:
        #No Experience Added
        #print("Experience Section Exception:", e)
        pass
    #print(experience_info_list)

    #Skills Section
    skills_info_list = []
    try:
        skills_section = soup.find(
            'section', {
                'class':
                'pv-profile-section pv-skill-categories-section artdeco-container-card ember-view'
            })
    except Exception as e:
        #print("skills_section Exception", e)
        pass

    try:
        if skills_section is None:

            skills_section = soup.find(
                'section', {
                    'class':
                    'pv-profile-section pv-skill-categories-section artdeco-container-card artdeco-card ember-view'
                })

        all_skills = skills_section.find_all('span', {
            'class':
            'pv-skill-category-entity__name-text t-16 t-black t-bold'
        })
        #print(all_skills)

        for i in range(len(all_skills)):
            skills_info_list.append(all_skills[i].get_text().strip())
        print(skills_info_list)

    except Exception as e:
        #No skills added
        print("Skills Section Exception:", e)
        pass

    #Volunteering Section:
    volunteer_info_list = []
    items = []
    list_items = []
    try:
        volunteer_section = soup.find(
            'section',
            {'class': 'pv-profile-section volunteering-section ember-view'})
        list_items = volunteer_section.find(
            'ul', {
                'class':
                'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more ember-view'
            })
    except Exception as e:
        #print("Volunteering volunteer_section Exception:", e)
        pass

    try:
        if list_items is None:
            list_items = volunteer_section.find(
                'ul', {
                    'class':
                    'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'
                })
    except Exception as e:
        #print("Volunteering list_items Exception:", e)
        pass

    try:
        items = list_items.find_all(
            'li', {
                'class':
                'pv-profile-section__sortable-item pv-profile-section__section-info-item relative pv-profile-section__sortable-item--v2 pv-profile-section__list-item sortable-item ember-view'
            })
    except Exception as e:
        # print("Volunteering list_items Exception:", e)
        pass

    try:
        if items == []:
            items = list_items.find_all(
                'li', {
                    'class':
                    'pv-profile-section__list-item pv-volunteering-entity pv-profile-section__card-item ember-view'
                })
    except Exception as e:
        # print("Volunteering items Exception:", e)
        pass

    try:
        for i in range(len(items)):
            curr_name = items[i].find('span',
                                      {'class': 'pv-entity__secondary-title'})
            curr_name = curr_name.get_text().strip()

            curr_role = items[i].find('h3', {'class': 't-16 t-black t-bold'})
            curr_role = curr_role.get_text().strip()

            try:
                curr_dates = items[i].find(
                    'h4', {
                        'class':
                        'pv-entity__date-range detail-facet inline-block t-14 t-black--light t-normal'
                    })
                curr_dates = curr_dates.get_text().strip()
                curr_dates = curr_dates.replace('Dates volunteered\n', '')
            except Exception as e:
                #print("curr_dates Exception", e)
                curr_dates = ''

            try:
                curr_description = items[i].find(
                    'p', {'class': 'pv-entity__description t-14 t-normal mt4'})
                curr_description = curr_description.get_text().strip()
            except Exception as e:
                #print("curr_description Exception", e)
                curr_description = ''

            #Construct volunteer_info_list from above data
            volunteer_info_list.append(
                [curr_name, curr_role, curr_dates, curr_description])

    except Exception as e:
        #no volunteering added
        #print("Volunteering Section Exception", e)
        pass

    try:
        # click to expand honors and awards section because only either projects or honors and awards can be expanded at a time
        honors_and_awards_expand_button = browser.find_element_by_xpath(
            "//section[@class='pv-profile-section pv-accomplishments-section artdeco-container-card ember-view']//button[@aria-label='Expand honors & awards section']"
        )
        browser.execute_script("arguments[0].click();",
                               honors_and_awards_expand_button)

        # click to expand honors and awards section to show more
        honors_and_awards_expand_button2 = browser.find_element_by_xpath(
            "//section[@class='pv-profile-section pv-accomplishments-section artdeco-container-card ember-view']//button[@aria-controls='honors-expandable-content' and @aria-expanded='false']"
        )
        browser.execute_script("arguments[0].click();",
                               honors_and_awards_expand_button2)
    except Exception as e:
        #print("honors_and_awards_expand_button Exception", e)
        pass

    #accomplishments section
    accomplishments_info_list = []
    try:
        accomplishments_section = soup.find_all(
            'section', {
                'class':
                'pv-profile-section pv-accomplishments-section artdeco-container-card ember-view'
            })

        honors_section = accomplishments_section[0].find(
            'div', {'aria-labelledby': 'honors-title'})

        list_items = honors_section.find_all(
            'li', {'class': 'pv-accomplishments-block__summary-list-item'})

        for i in range(len(list_items)):
            # appending empty string for year field
            accomplishments_info_list.append(
                [list_items[i].get_text().strip(), ""])

    except Exception as e:
        #No accomplishments added
        #print("Accomplishments Section Exception", e)
        pass

    #empty hobbies_info_list because it is not available on linkedin
    hobbies_info_list = []

    #Close the browser once scraping is done
    browser.close()

    #TESTING OUTPUTS
    #print("LISTS")
    #print(basic_info_list)
    #print(education_info_list)
    #print(projects_info_list)
    #print(certifications_info_list)
    #print(experience_info_list)
    #print(skills_info_list)
    #print(volunteer_info_list)
    #print(accomplishments_info_list)

    final_all_lists = [
        basic_info_list, education_info_list, projects_info_list,
        certifications_info_list, experience_info_list, skills_info_list,
        volunteer_info_list, accomplishments_info_list, hobbies_info_list
    ]

    json_data = {
        'basic_info_list': basic_info_list,
        'education_info_list': education_info_list,
        'projects_info_list': projects_info_list,
        'certifications_info_list': certifications_info_list,
        'experience_info_list': experience_info_list,
        'skills_info_list': skills_info_list,
        'volunteer_info_list': volunteer_info_list,
        'accomplishments_info_list': accomplishments_info_list,
        'hobbies_info_list': hobbies_info_list
    }

    final_json_string = json.dumps(json_data)
    #print(final_json_string)

    fileheader = open("static/test.json", 'w')

    fileheader.writelines(final_json_string)
    fileheader = open("output/test.json", 'w')

    fileheader.writelines(final_json_string)

    return json_data
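A usage sketch for scrapper, assuming a config.txt with LinkedIn credentials on its first two lines and existing static/ and output/ directories; the profile URL is the one mentioned in the function's comments.

# reuse the commented-out profile link from the function body
profile_data = scrapper("https://www.linkedin.com/in/rishab-saini/")
print(profile_data['basic_info_list'])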
Code example #7
def GetEducation(soup):

    education_info_list = []
    try:
        edu_section = soup.find('section', {
            'id': 'education-section'
        }).find('ul')
        edu_section = edu_section.find_all(
            'li', {
                'class':
                "pv-profile-section__sortable-item pv-profile-section__section-info-item relative pv-profile-section__sortable-item--v2 pv-profile-section__list-item sortable-item ember-view"
            })
        college_names = []
        degree_names = []
        major_names = []
        grades = []
        dates = []
        for x in range(len(edu_section)):
            curr_section = edu_section[x]
            try:
                college_name = curr_section.find(
                    'h3',
                    {'class': 'pv-entity__school-name t-16 t-black t-bold'})
                college_names.append(college_name.get_text())
            except:
                college_names.append('')

            try:
                degree_name = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                degree_names.append(degree_name.get_text())
            except:
                degree_names.append('')

            try:
                major_name = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                major_names.append(major_name.get_text())
            except:
                major_names.append('')

            try:
                grade = curr_section.find(
                    'p', {
                        'class':
                        'pv-entity__secondary-title pv-entity__grade t-14 t-black t-normal'
                    }).find('span', {'class': 'pv-entity__comma-item'})
                grades.append(grade.get_text())
            except:
                grades.append('')

            try:
                time = curr_section.find(
                    'p',
                    {'class': 'pv-entity__dates t-14 t-black--light t-normal'})
                dates.append((time.find_all('time')[1].get_text()))
            except:
                dates.append('')

        for i in range(len(edu_section)):
            education_info_list.append([
                college_names[i], degree_names[i], major_names[i], dates[i],
                grades[i]
            ])
    except:
        # If no education added
        pass

    #print(education_info_list)
    return education_info_list
Code example #8
File: twwiter.py Project: cdhao123/Etong
url = 'https://twitter.com/realDonaldTrump'
chromedriver_path = "C:\\Program Files\\Python37\\chromedriver.exe"  # path to the ChromeDriver executable
options = webdriver.ChromeOptions()
# Important: exclude the enable-automation switch ("developer mode") so sites are less likely to detect Selenium
options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
driver.get(url)
js = "var q=document.documentElement.scrollTop="
for i in range(100):
    driver.execute_script(js + str(10000 * (i + 1)))
    time.sleep(0.5)
page_source = driver.page_source
html = BeautifulSoup(page_source, 'html.parser')
tweets = html.find_all('div', class_="content")
text = ''
for tweet in tweets:
    try:
        time = tweet.find('small', class_="time")
        time = time.find_all('span')[0].text
        tweet = tweet.find('p')
        print(time)
        print(tweet.text.strip())
        text += tweet.text.strip()
        print('-----------------------------------------------')
    except:
        pass
wordcloud = WordCloud(scale=16, background_color='white').generate(text)
image_produce = wordcloud.to_image()
image_produce.show()
driver.close()
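A sketch of the imports this script relies on (an assumption; the original file may organize them differently):

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from wordcloud import WordCloud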
Code example #9
def scrap_movie_details(movie_url):
    id1 = movie_url[27:-1]
    file_name = id1 + ".json"
    if os.path.isfile(file_name):
        with open(file_name, "r") as file1:
            read = file1.read()
            data = json.loads(read)
        return (data)
    link = requests.get(movie_url)  # fetch the movie page
    soup = BeautifulSoup(
        link.text,
        "html.parser")  # parse the HTML with BeautifulSoup

    name_tag = soup.find(
        "div", class_="title_block")  #find movie name and poster using html
    name = name_tag.find("h1").text[:-8]
    poster_tag = soup.find("div", class_="poster")
    poster = poster_tag.find("img")["src"]

    a = []
    director = []
    language = []
    Country = ""  # default in case no "Country:" row is found below
    time = soup.find("div", class_="subtext")
    run = time.find("time").text.strip()
    if run[1] == "h" and run[-3:] == "min":
        runtime = int(run[0]) * 60 + (int(
            run[3:-3]))  # runtime like "2h 15min": convert hours and minutes to minutes
    else:
        runtime = run[0:-3]
        print(runtime)
    for j in time:
        if j in time.find_all("a"):  # collect the genre links from the subtext block
            genre_list = j.text
            a.append(genre_list)
    genre = a
    genre.pop()  # the last <a> is the release date, not a genre

    summary = soup.find("div", class_="plot_summary")
    bio = summary.find(
        "div",
        class_="summary_text").text.strip()  #find bio of discription of movie

    direct_tag = summary.find("div", class_="credit_summary_item"
                              )  # find all directors credited for the movie
    direct = direct_tag.find_all("a")
    for i in direct:
        director.append(i.text)

    article = soup.find("div", {'class': "article", 'id': "titleDetails"})
    txt_block = article.find_all("div", class_="txt-block")
    for m in txt_block:
        b = m.find("h4",
                   class_="inline")  #find countary and all langauage in a list
        if (b != None):
            if b.text == "Country:":
                Country = m.find("a").text
            if b.text == "Language:":
                Language = m.find_all("a")
                for i in Language:
                    language.append(i.text)
    abc = scrap_movie_cast(movie_url)

    all_details = {  # collect all the scraped data into one dictionary
        "name": name,
        "runtime": runtime,
        "genre": genre,
        "bio": bio,
        "Country": Country,
        "poster": poster,
        "director": director,
        "language": language,
        "cast": abc
    }
    print(all_details)

    with open(file_name, "w") as file1:
        json.dump(all_details, file1, indent=4)
    return (all_details)
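A usage sketch, assuming IMDb title URLs of the usual "https://www.imdb.com/title/tt.../" form; the slice movie_url[27:-1] above depends on exactly that prefix, and the title id below is illustrative.

# illustrative IMDb title URL; the trailing slash matters for the id slice
details = scrap_movie_details("https://www.imdb.com/title/tt4154796/")
print(details["name"], details["runtime"], details["genre"])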
Code example #10
File: spider_xueshu.py Project: lif54334/tools_box
def get_urls(text):
    all_titles = []  # titles
    all_abstracts = []  # abstracts
    all_authors = []  # authors
    all_paper_urls = []  # preliminary paper URLs
    all_publish = []  # publication sources
    all_time = []  # publication dates

    soup = BeautifulSoup(text, 'lxml')

    title_datas = soup.select(
        'div.sc_content > h3 > a')  # select() returns a <class 'list'>

    author_datas = soup.find_all(
        'div', 'sc_info')  # find_all() returns a <class 'bs4.element.ResultSet'>

    abstract_datas = soup.find_all('div', 'c_abstract')

    publish_datas = soup.find_all('div', 'sc_info')

    time_datas = soup.find_all('div', 'sc_info')
    for item in title_datas:
        result = {
            'title': item.get_text(),
            'href': item.get('href')  # full paper URL; by inspection, only part of it needs extracting (observed forms below)
            # http://xueshu.baidu.com/usercenter/paper/show?paperid=389ef371e5dae36e3a05b187f7eb2a95&site=xueshu_se
            # /s?wd=paperuri%3A%28389ef371e5dae36e3a05b187f7eb2a95%29&filter=sc_long_sign&sc_ks_para=q%3D%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E7%A0%94%E7%A9%B6%E7%BB%BC%E8%BF%B0&sc_us=11073893925633194305&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8
        }
        all_titles.append(item.get_text())
        wd = str(parse.urlparse(item.get('href')).query).split('&')[0]
        paperid = wd.split('%')[2][2:]
        params = {'paperid': paperid, 'site': 'xueshu_se'}
        url = 'http://xueshu.baidu.com/usercenter/paper/show?' + urlencode(
            params)
        all_paper_urls.append(url)
        # print(url)
        # print(result)

    for abs in abstract_datas:  # abs is a <class 'bs4.element.Tag'>
        str_list = []
        for l in abs.contents:  # l is a <class 'bs4.element.NavigableString'>
            if str(l).replace('\n', '').strip():
                str_list.append(str(l).replace('\n', '').strip())
            else:
                str_list.append("unknown")
        # print("".join(str_list).replace('<em>','').replace('</em>',''))
        all_abstracts.append("".join(str_list).replace('<em>', '').replace(
            '</em>', ''))

    for authors in author_datas:  # authors is a <class 'bs4.element.Tag'>
        for span in authors.find_all(
                'span', limit=1):  # span is a <class 'bs4.element.Tag'>
            each_authors = []
            for alist in span.find_all('a'):
                if alist.string is None:
                    each_authors.append("unknown")
                else:
                    each_authors.append(alist.string)
            all_authors.append(each_authors)

    for publish in publish_datas:  # publish is a <class 'bs4.element.Tag'>
        each_publish = []
        spans = publish.find_all('span')  # find_all() returns a ResultSet of span tags
        spans = str(spans)
        try:
            publish_name = re.search(r'《.*》', spans)
            publish_name = publish_name.group()
        except:
            publish_name = "unknown"

        each_publish.append(publish_name)
        all_publish.append(each_publish)

    for time in time_datas:  # time is a <class 'bs4.element.Tag'>
        each_time = []
        for span in time.find_all(
                'span',
            {"class": "sc_time"}):  # span is a <class 'bs4.element.Tag'>
            time_name = "unknown"
            for alist in span:
                try:
                    alist.string = ((alist.string).strip())
                    time_name = alist.string
                except:
                    time_name = "unknown"
            each_time.append(time_name)
        all_time.append(each_time)

    return all_titles, all_authors, all_abstracts, all_paper_urls, all_publish, all_time
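A hedged usage sketch for get_urls: it expects the raw HTML of a Baidu Xueshu search-results page; the request below is illustrative and omits any headers or encoding handling.

import requests

# illustrative search against Baidu Xueshu (xueshu.baidu.com)
html = requests.get('http://xueshu.baidu.com/s',
                    params={'wd': 'deep learning'}).text
titles, authors, abstracts, urls, publish, times = get_urls(html)
print(titles[:3])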
Code example #11
File: task9.py Project: madhu20336/web-scraping
def movie_details():
    global time
    global serial_no
    time_limit = random.randint(1, 3)
    imdb_api = "https://www.imdb.com/india/top-rated-indian-movies/?ref_=nv_mv_250_in"
    # time.sleep(time_limit)
    imdb_url = requests.get(imdb_api)
    data = imdb_url.json
    soup = BeautifulSoup(imdb_url.text, "html.parser")
    dict_1 = {}

    div = soup.find("div", class_="lister")
    body = div.find("tbody", class_="lister-list")
    name = body.find_all("tr")
    for tr in name:
        list_1 = []
        director = []
        language = []
        genre = []
        movie_name = tr.find("td", class_="titleColumn").a.get_text()
        rating = tr.find("td",
                         class_="ratingColumn imdbRating").strong.get_text()
        years = tr.find("td", class_="titleColumn").span.get_text()
        link = tr.find("td", class_="titleColumn").a['href']
        url = 'https://www.imdb.com' + str(link)
        serial_no += 1
        # time.sleep(time_limit)
        movie_url = requests.get(url)
        soup = BeautifulSoup(movie_url.text, "html.parser")
        director_name = soup.find("div",
                                  class_="credit_summary_item").a.get_text()
        director.append(director_name)
        movies_poster = soup.find("div", class_="poster").a['href']
        movie_poster = "https://www.imdb.com/" + movies_poster
        bio = soup.find("div", class_="plot_summary")
        movie_bio = bio.find("div", class_="summary_text").get_text().strip()
        detail = soup.find("div",
                           attrs={
                               "class": "article",
                               "id": "titleDetails"
                           })
        div1 = detail.find_all("div")
        for i in div1:
            run = i.find_all("h4")
            for j in run:
                if "Language:" in j:
                    lan = i.find_all("a")
                    for lang_uage in lan:
                        movie_language = lang_uage.get_text()
                        language.append(movie_language)
        time = soup.find("div", class_="subtext")
        runtime = time.find("time").get_text().strip()
        hour_to_min = (int(runtime[0])) * 60
        i = 0
        mins = ""
        a = runtime[3:]
        while i < len(a):
            if a[i] == "m":
                break

            mins = mins + a[i]
            i = i + 1
        runtime_of_movie = hour_to_min + int(mins)
        movie_genre = time.find_all("a")
        movie_genre.pop()
        for i in movie_genre:
            genre_1 = i.get_text()
            genre.append(genre_1)
        list1 = []
        api_cast = url
        url_cast = requests.get(api_cast)
        soup = BeautifulSoup(url_cast.text, "html.parser")
        table = soup.find("table", "cast_list")
        td = table.find_all("td", class_="")
        for i in td:
            my_dict = {}
            id = i.a["href"][6:15]
            artist = i.a.get_text().strip()
            my_dict["artist"] = artist
            my_dict["imbd_id"] = id
            list1.append(my_dict)
        dict_1["movie_name"] = movie_name
        dict_1["year"] = int(years[1:5])
        dict_1["rating"] = rating
        dict_1["position"] = int(serial_no)
        dict_1["url"] = url
        dict_1["director"] = director
        dict_1["country"] = "India"
        dict_1["poster_url"] = movie_poster
        dict_1["language"] = language
        dict_1["movie_bio"] = movie_bio
        dict_1["runtime"] = runtime_of_movie
        dict_1["movie_genre"] = genre
        dict_1["cast"] = list1
        list_1.append(dict_1)
        link = link[7:16]
        print(link)

        with open("movie/" + link + ".json", "w") as Data:
            json.dump(list_1, Data, indent=4)
    return (list_1)