Exemple #1
0
 def updateDB(self, exe_ID):
     """Flag one experience-header row as crawled.

     Sets [Is_Crawl] = 1 on the row of [LinkedIn_Experience_Header] whose
     [Exp_Header_Id] equals ``exe_ID``.  Any exception is caught and
     printed; nothing is returned and nothing is re-raised.
     """
     mark_crawled_sql = """ UPDATE   [LinkedIn_Experience_Header] SET [Is_Crawl] = 1 WHERE [Exp_Header_Id] = ? """
     try:
         MyConnection.insertUpdateDB(mark_crawled_sql, [exe_ID])
         print("UPDATE DONE ! ")
     except Exception as error:
         print("UPDATE ERROR ! ", error)
Exemple #2
0
 def insertDB(self, key, tag, exe_ID, URL):
     """Store one tag for an experience header.

     Inserts a row into [dbo].[Linkedin_Experience_Tag] with
     [Exp_Header_Id] = ``exe_ID``, [URL] = ``URL``, [Tag] = ``key`` and
     [Tag_Type] = ``tag``.  Failures are caught and printed, never raised.
     """
     insert_tag_sql = """ INSERT INTO [dbo].[Linkedin_Experience_Tag]
         ([Exp_Header_Id]
         ,[URL]
         ,[Tag]
         ,[Tag_Type])
     VALUES (?,?,?,?)"""
     # Parameter order follows the column list of the statement above.
     row = [exe_ID, URL, key, tag]
     try:
         MyConnection.insertUpdateDB(insert_tag_sql, row)
         print("INSERT DONE ! ")
     except Exception as error:
         print("INSERT ERROR ! ", error)
 def updateSpiderStatus(self, flag):
     """Write the spider's enabled/disabled state to [Spider_Status].

     ``flag`` is converted with ``int``.  Zero stores [Status] = 0 together
     with an empty [Last_Page_Crawl], the current time as [Last_Stop],
     ``self.timeStart`` as [Last_Start] and ``self.current_URL``.  Any other
     value stores [Status] = 1 with ``self.current_URL`` and then resets
     ``self.timeStart`` to the current time.  The row is selected by
     ``self.Spider_ID``.  Exceptions are not handled here.
     """
     if int(flag) != 0:
         # Enable: record the URL first, then restart the run timer.
         enable_sql = """UPDATE [Spider_Status] SET [Status] = 1, [URL] = ?  WHERE [Row_ID] = ? """
         enable_params = [self.current_URL, self.Spider_ID]
         MyConnection.insertUpdateDB(enable_sql, enable_params)
         self.timeStart = datetime.now()
         return

     # Disable: close the run by storing when it stopped and when it started.
     disable_sql = """UPDATE [Spider_Status] SET [Status] = 0 , [Last_Page_Crawl] =? 
         , [Last_Stop]= ? , [Last_Start] = ? , [URL] = ?  WHERE [Row_ID] = ? """
     disable_params = [
         "",
         datetime.now(),
         self.timeStart,
         self.current_URL,
         self.Spider_ID,
     ]
     MyConnection.insertUpdateDB(disable_sql, disable_params)
Exemple #4
0
    def main(self):
        """Split every un-picked "Linkedin_Detail" row into the detail tables.

        Selects the rows of "Linkedin_Detail" whose "Is_Picked" is null.  For
        each row the "Schools ", "Experiences" and "Skill" columns are split
        on the ``---BREAK---`` marker, each piece is split into lines, and the
        result is inserted into "Linkedin_School", "Linkedin_Experiences" and
        "Linkedin_Skill".  Finally the row is updated with "Is_Picked" = '1'.

        Pieces that do not have the expected number of lines are reported with
        a print and skipped; blank pieces are skipped silently.  The method
        returns nothing and prints, rather than raises, any error.  The
        connection used for the SELECT is closed even when an error occurs.
        """
        school_sql = """ INSERT INTO public."Linkedin_School"(
                        "School_Name", "Secondary_Title", "Date_Attended", 
                        "Activitive", "Linkedin_URL", "Field_Study")
                        VALUES (?, ?, ?, ?, ?, ?) """
        experience_sql = """INSERT INTO public."Linkedin_Experiences"(
                            "Position", "Company_Name", "Date_Employee", "Employee_Duration", "Location", 
                            "Description", "URL")
                            VALUES (?, ?, ?, ?, ?, ?, ?) """
        skill_sql = """INSERT INTO public."Linkedin_Skill"(
                            "Skill", "URL")
                            VALUES (?,?)"""
        picked_sql = """  UPDATE "Linkedin_Detail" SET "Is_Picked" = '1'  WHERE "URL" =  ?"""

        try:
            sQuery = """   SELECT  DISTINCT "URL"  ,"Head_Line", "Schools ",
            "Sumary", "Skill", "Experiences"   from "Linkedin_Detail" where "Is_Picked" is  null """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    # A NULL column arrives as None; treat it as empty text so
                    # one incomplete profile cannot abort the whole run.
                    li_Schools = (result[2] or "").split("---BREAK---")
                    li_Skill = (result[4] or "").split("---BREAK---")
                    li_Experiences = (result[5] or "").split("---BREAK---")

                    # Headline: keep only the words in front of the word "AT".
                    # NOTE(review): the cleaned headline is not stored by any
                    # statement below.
                    HeadLine = str(result[1]).upper()
                    li_Headline = HeadLine.split()
                    if "AT" in li_Headline:
                        HeadLine = ' '.join(
                            li_Headline[:li_Headline.index("AT")])

                    # Schools: fields sit on fixed line positions of a piece.
                    for x in li_Schools:
                        if not x.strip():
                            continue
                        try:
                            temp = x.split("\n")
                            school_Name = temp[0]
                            title = temp[2]
                            field_Study = temp[4]
                            duration = temp[6]
                            active = temp[7].replace(
                                'Activities and Societies:', '')
                            value = [
                                school_Name, title, duration, active, url,
                                field_Study
                            ]
                            MyConnection.insertUpdateDB(school_sql, value)
                            print("INSERT SCHOOL DONE ! ")
                        except Exception as e:
                            print("Pick Schools ERROR ", e)

                    # Experiences: location and description are optional.
                    for x in li_Experiences:
                        if not x.strip():
                            continue
                        try:
                            temp = x.split("\n")
                            position = temp[0]
                            company_Name = temp[2]
                            date_Empl = temp[4]
                            empl_Duration = temp[6]

                            location = ""
                            desctiption = ""
                            # temp[8] exists only when there are more than
                            # eight lines.
                            if 8 < len(temp):
                                location = temp[8]
                                desctiption = ' '.join(temp[9:])
                                print(desctiption)

                            value = [
                                position, company_Name, date_Empl,
                                empl_Duration, location, desctiption, url
                            ]
                            MyConnection.insertUpdateDB(experience_sql, value)
                            print("INSERT EXPERIENCES DONE ! ")
                        except Exception as e:
                            print("Pick EXPERIENCES ERROR ", e)

                    # Skills: one row per non-blank line.
                    for x in li_Skill:
                        try:
                            for i in x.split("\n"):
                                skill_name = i.strip()
                                if not skill_name:
                                    continue
                                MyConnection.insertUpdateDB(
                                    skill_sql, [skill_name, url])
                                print("INSERT SKILL DONE ! ")
                        except Exception as e:
                            print("INSERT SKILL ERROR ! ", e)

                    # Mark this URL as picked.
                    try:
                        MyConnection.insertUpdateDB(picked_sql, [url])
                        print("Update DONE ! ")
                    except Exception as e:
                        print("UPDATE ERROR ! ", e)
                    result = cursor.fetchone()
            finally:
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
    def main(self):
        """Copy skills and education of selected profiles into header tables.

        Reads [URL], [Skill], [Row_ID] and [Schools ] from
        [DB_LINKEDIN].[dbo].[Linkedin_Detail], splits the skill and education
        text on the ``---BREAK---`` marker and inserts one row per piece into
        [dbo].[Linkedin_Skill_Header] and [dbo].[Linkedin_Education_Header].
        Pieces of two characters or fewer are ignored.  An education piece
        that lacks the expected lines is reported with a print and skipped.

        Nothing is returned; errors are printed, not raised.  The connection
        used for the SELECT is closed even when an error occurs.
        """
        skill_sql = """INSERT INTO [dbo].[Linkedin_Skill_Header]
                                    ([Profile_ID]
                                    ,[URL]
                                    ,[Skill_Name])
                                VALUES(?,?,?) """
        education_sql = """INSERT INTO [dbo].[Linkedin_Education_Header]
                                ([Profile_ID]
                                ,[URL]
                                ,[School_Name]
                                ,[Degree_Name]
                                ,[Field_Of_Study]
                                ,[Dates_Attended_Or_Expected_graduation]
                                ,[Education_Description])
                            VALUES (?,?,?,?,?,?,?) """

        try:
            # NOTE(review): the WHERE clause pins one literal Row_ID; confirm
            # whether that is intended outside of debugging.
            sQuery = """ SELECT DISTINCT [URL] , [Skill] , [Row_ID], [Schools ]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE  Row_ID = '1523' """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    profile_ID = result[2]
                    # A NULL column arrives as None; treat it as empty text.
                    li_Skill = (result[1] or '').split('---BREAK---')
                    li_Edu = (result[3] or '').split('---BREAK---')

                    # Skills: each piece is stored as is.
                    for x in li_Skill:
                        if len(x) <= 2:
                            continue
                        try:
                            value = [profile_ID, url, x]
                            MyConnection.insertUpdateDB(skill_sql, value)
                            print("INSERT SKILL DONE ! ")
                        except Exception as e:
                            print("INSERT SKILL ERROR ! ", e)

                    # Education: fields sit on fixed line positions.
                    for entry in li_Edu:
                        if len(entry) <= 2:
                            continue
                        try:
                            temp = entry.split('\n')
                            school_Name = temp[0]
                            Degree_Name = temp[2]
                            field_Of_Study = temp[4]
                            dates_Attended = temp[6]
                            description = ' '.join(temp[7:])

                            value = [
                                profile_ID, url, school_Name, Degree_Name,
                                field_Of_Study, dates_Attended, description
                            ]
                            MyConnection.insertUpdateDB(education_sql, value)
                            print("INSERT EDU DONE ! ")
                        except Exception as e:
                            print("INSERT EDU ERROR ! ", e)

                    result = cursor.fetchone()
            finally:
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
Exemple #6
0
    def main(self):
        """Split the experiences of untagged profiles into header rows.

        Reads the rows of [DB_LINKEDIN].[dbo].[Linkedin_Detail] selected by
        the query below (experience text longer than five characters,
        Row_ID < 1630, [Is_Tag] null).  The [Experiences] text is split on the
        ``---BREAK---`` marker and every piece that has at least nine lines
        becomes one row of [dbo].[LinkedIn_Experience_Header]; shorter pieces
        are reported with "NOT SAME PATTERN ! " and skipped.  Afterwards the
        profile is updated with Is_Tag = 1.

        Nothing is returned; errors are printed, not raised.  The connection
        used for the SELECT is closed even when an error occurs.
        """
        insert_sql = """ INSERT INTO [dbo].[LinkedIn_Experience_Header]
                                ( [URL]
                                ,[Position]
                                ,[Company_Name]
                                ,[Dates_Employed]
                                ,[Employment_Duration]
                                ,[Location]
                                ,[Experience_Description]
                                , [Profile_Id])
                            VALUES (?,?,?,?,?,?,?,?) """
        tagged_sql = """ UPDATE [Linkedin_Detail]  SET Is_Tag = 1 WHERE  [Row_ID] = ? """

        try:
            sQuery = """SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE    len (Experiences) > 5 and Row_ID < 1630  AND [Is_Tag] IS NULL """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    profile_ID = result[4]
                    li_Experiences = result[1].split('---BREAK---')

                    for entry in li_Experiences:
                        temp_Element = entry.split('\n')
                        try:
                            # Fields sit on fixed line positions; a missing
                            # one raises IndexError and the piece is skipped.
                            position = temp_Element[0]
                            company_Name = temp_Element[2]
                            date_Employee = temp_Element[4]
                            employee_Duration = temp_Element[6]
                            location = temp_Element[8]
                            description_Exp = ' '.join(temp_Element[9:])

                            value = [
                                url, position, company_Name, date_Employee,
                                employee_Duration, location, description_Exp,
                                profile_ID
                            ]
                            MyConnection.insertUpdateDB(insert_sql, value)
                            print("INSERT DONE ! ")
                        except Exception as e:
                            print("NOT SAME PATTERN ! ", e)

                    # Mark the profile as processed.
                    try:
                        MyConnection.insertUpdateDB(tagged_sql, [profile_ID])
                        print("UPDATE DONE ")
                    except Exception as e:
                        print("UPDATE ERROR ! ", e)
                    result = cursor.fetchone()
            finally:
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
    def main(self, country, start, end, ID):
        """Search Bing for LinkedIn profiles of companies and store the hits.

        Loads the companies of [Company] with ``start <= [Row_ID] < end`` that
        have a clean name and no [Is_Crawl] flag, searches Bing for each one
        in a headless Chrome, and inserts every result row of every result
        page into [Linkedin_URL].  Afterwards the company is flagged with
        [Is_Crawl] = 1.

        Paging stops when the "next page" element can no longer be found; the
        resulting exception is printed as "ERROR  WHEN CRAWLING ! ".  Nothing
        is returned and errors are printed, not raised.  The database
        connection and the browser are released even when an error occurs.

        Args:
            country: value stored in the [Country] column of every hit.
            start: inclusive lower bound for [Company].[Row_ID].
            end: exclusive upper bound for [Company].[Row_ID].
            ID: not used by this method.
        """
        url_pattern = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )
        insert_sql = """INSERT INTO  [Linkedin_URL]
                                    ([Linkedin_Name]
                                    ,[Linkedin_URL]
                                    ,[DUNS_NUMBER]
                                    ,[Linkedin_Type]
                                    ,[Country])
                                VALUES (?,?,?,?,? )"""

        try:
            # Company names to search, keyed by D-U-N-S number.
            listKey = {}
            connection = MyConnection.getConnection()
            try:
                cursor = connection.cursor()
                SQLCommand = """SELECT DISTINCT [D-U-N-S] , [Company_Name_Clean] FROM [Company] WHERE
             [Company_Name_Clean] is not null  AND [Is_Crawl] is NULL and [Row_ID] >= ? and [Row_ID]  < ?  """
                cursor.execute(SQLCommand, [start, end])
                results = cursor.fetchone()
                while results:
                    listKey[int(results[0])] = results[1]
                    results = cursor.fetchone()
            finally:
                connection.close()
            print("--------------> Total Size : ", len(listKey))

            # Run Chrome without opening a window.
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            browser = webdriver.Chrome(chrome_options=options)
            try:
                browser.get("https://www.bing.com/")
                time.sleep(4)

                for duns, company in listKey.items():
                    time.sleep(randint(8, 15))

                    print("Company Search : ", company)
                    # NOTE(review): the query text hard-codes "VietNam" while
                    # the rows are stored with the ``country`` argument.
                    keys = " \"VietNam \" " + " site: linkedin.com/in " + "\"" + company + " \""
                    KeySearch = browser.find_element_by_xpath(
                        "//*[@class='b_searchbox']")
                    KeySearch.clear()
                    KeySearch.send_keys(keys)  # type the query
                    time.sleep(randint(2, 6))
                    browser.find_element_by_xpath(
                        "//*[@id='sb_form_go']").click()
                    time.sleep(randint(3, 15))

                    try:
                        allRow = browser.find_elements_by_xpath(
                            "//*[@class='b_algo']")
                        count_2 = 0
                        if allRow:
                            while True:
                                timeSpleep = randint(5, 10)
                                print("TIME DELAY : ", timeSpleep)
                                time.sleep(timeSpleep)
                                # Longer pause after roughly 50 stored rows;
                                # ">=" because the counter grows by a whole
                                # page between two checks.
                                if count_2 >= 50:
                                    time.sleep(40)
                                    count_2 = 0
                                allRow = browser.find_elements_by_xpath(
                                    "//*[@class='b_algo']")
                                browser.execute_script(
                                    "window.scrollTo(0, document.body.scrollHeight);"
                                )  # scroll to the bottom of the page
                                print("--------> Page Size : ", len(allRow))
                                for x in allRow:
                                    txt = x.text
                                    findString = txt.find("LinkedIn")
                                    # Name is the text in front of "LinkedIn";
                                    # keep the whole text when it is absent.
                                    if findString != -1:
                                        companyName = txt[:findString]
                                    else:
                                        companyName = txt
                                    urls = ''.join(url_pattern.findall(txt))
                                    print(urls)
                                    print(
                                        "--------------------------------------------------"
                                    )

                                    # Save the hit.
                                    try:
                                        value = [
                                            companyName, urls, duns,
                                            "Profile", country
                                        ]
                                        MyConnection.insertUpdateDB(
                                            insert_sql, value)
                                        print("INSERT DONE ! ")
                                        count_2 += 1
                                    except Exception as e:
                                        print("INSERT CRAWL ERROR : ", e)
                                # Raises on the last page, which ends paging.
                                browser.find_element_by_xpath(
                                    "//*[@class='sb_pagN']").click()
                                time.sleep(3)
                    except Exception as e:
                        print("ERROR  WHEN CRAWLING ! ", e)

                    # Record that this company has been searched.
                    try:
                        command = """UPDATE [Company]  SET [Is_Crawl] = 1 WHERE  [D-U-N-S] = ? """
                        MyConnection.insertUpdateDB(command, [duns])
                        print("UPDATE Crawl DONE ! ")
                        time.sleep(2)
                    except Exception as e:
                        print("UPDATE Crawl Error : ", e)
            finally:
                browser.quit()
        except Exception as e:
            print("ERROR IS : ", e)
    def main(self):
        """Log in to LinkedIn and store the details of each profile URL.

        Calls ``self.getListURL()``, opens Chrome, logs in, and then visits
        every URL of ``self.listURL``.  For each profile the visible fields
        are read by XPath, inserted as one row into "Linkedin_Detail", and the
        URL is flagged with "Is_Crawl" = '1' in "Linkedin_URL".

        Optional fields fall back to an empty string when their element is
        missing.  List-like sections (experiences, education, skills,
        languages, courses, projects, publications) are stored as the element
        texts, each followed by ``---BREAK---``.  The location and connection
        lookups are not guarded: if either element is missing the profile is
        skipped and the error printed.

        Nothing is returned; errors are printed, not raised.  The browser is
        shut down when the run ends, whether or not an error occurred.
        """
        url_pattern = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )
        accomplishment = "//*[@class='pv-profile-section accordion-panel pv-accomplishments-block %s ember-view']"

        try:
            self.getListURL()
            browser = webdriver.Chrome()
            try:

                def text_of(xpath):
                    # Text of the first matching element, "" when it is absent.
                    try:
                        return browser.find_element_by_xpath(xpath).text
                    except Exception:
                        return ""

                def joined_text_of(xpath):
                    # Texts of all matching elements, each followed by the
                    # break marker; "" when the lookup fails.
                    try:
                        return "".join(
                            element.text + "---BREAK---"
                            for element in browser.find_elements_by_xpath(xpath))
                    except Exception:
                        return ""

                browser.get("https://www.google.com")
                time.sleep(20)
                browser.get("https://www.linkedin.com")
                username = browser.find_element_by_xpath(
                    "//*[@class='login-email']")
                password = browser.find_element_by_xpath(
                    "//*[@class='login-password']")
                # SECURITY(review): credentials are hard-coded in source; they
                # should come from configuration or the environment.
                username.send_keys("*****@*****.**")
                password.send_keys("duybaoo19")
                time.sleep(4)
                browser.find_element_by_xpath(
                    "//*[@class='login submit-button']").click()
                time.sleep(randint(20, 40))

                # Collect the details of every URL in listURL.
                for URL in self.listURL:
                    try:
                        browser.get(URL)
                        print(URL)
                        time.sleep(randint(5, 20))

                        # Expand the contact-information panel when present.
                        try:
                            browser.find_element_by_xpath(
                                "//*[@class='contact-see-more-less link-without-visited-state']"
                            ).click()
                        except Exception as e:
                            print(e)
                        time.sleep(3)

                        browser.execute_script(
                            "window.scrollTo(0, 500);")  # scroll down
                        time.sleep(5)
                        # The avatar URL is embedded in the element's style.
                        try:
                            style = browser.find_element_by_xpath(
                                "//*[@class=' presence-entity__image EntityPhoto-circle-8 ember-view']"
                            ).get_attribute('style')
                            avatar = url_pattern.findall(style)[0]
                        except Exception:
                            avatar = ""
                        browser.execute_script(
                            "window.scrollTo(0, 1000);")  # scroll further
                        time.sleep(5)
                        browser.execute_script(
                            "window.scrollTo(0, document.body.scrollHeight);"
                        )  # scroll to the bottom
                        time.sleep(5)

                        address = text_of(
                            "//*[@class='pv-contact-info__contact-type ci-address']")
                        IM = text_of(
                            "//*[@class='pv-contact-info__contact-type ci-ims']")
                        birthDay = text_of(
                            "//*[@class='pv-contact-info__contact-type ci-birthday']")
                        website = text_of(
                            "//*[@class='pv-contact-info__contact-type ci-websites']")
                        name = text_of(
                            "//*[@class='pv-top-card-section__name Sans-26px-black-85%']")
                        headLine = text_of(
                            "//*[@class='pv-top-card-section__headline Sans-19px-black-85%']")
                        company = text_of(
                            "//*[@class='pv-top-card-section__company Sans-17px-black-70% mb1 inline-block']")

                        # Not guarded: a missing element raises and the
                        # handler at the bottom of the loop skips the profile.
                        location = browser.find_element_by_xpath(
                            "//*[@class='pv-top-card-section__location Sans-17px-black-70% mb1 inline-block']"
                        ).text
                        # NOTE(review): this is the same XPath as the headline
                        # lookup above; confirm the intended element.
                        connections = browser.find_element_by_xpath(
                            "//*[@class='pv-top-card-section__headline Sans-19px-black-85%']"
                        ).text

                        summary = text_of(
                            "//*[@class='pv-top-card-section__summary-text Sans-15px-black-55% mt5 pt5 ember-view']")
                        experiences = joined_text_of(
                            "//*[@class='pv-profile-section__card-item pv-position-entity ember-view']")
                        education = joined_text_of(
                            "//*[@class='pv-profile-section__sortable-card-item pv-education-entity pv-profile-section__card-item ember-view']")

                        # Expand the skill list before reading it.
                        try:
                            browser.find_element_by_xpath(
                                "//*[@class='pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar']"
                            ).click()
                            time.sleep(4)
                        except Exception:
                            print("NOT SHOW MORE SKILL ! ")
                        skills = joined_text_of(
                            "//*[@class='pv-skill-entity__skill-name truncate Sans-15px-black-85%-semibold inline-block ']")

                        # Each accomplishment section is read from its own
                        # block.
                        langaues = joined_text_of(accomplishment % "languages")
                        course = joined_text_of(accomplishment % "courses")
                        project = joined_text_of(accomplishment % "projects")
                        publication = joined_text_of(
                            accomplishment % "publications")

                        phone = text_of("//*[@class='pv-contact-info__list']")
                        email = text_of(
                            "//*[@class='pv-contact-info__contact-link Sans-15px-black-55%']")
                        connected_Time = text_of(
                            "//*[@class='pv-contact-info__contact-item Sans-15px-black-55%']")

                        # Save the profile.
                        try:
                            command = """INSERT INTO  "Linkedin_Detail" (  "Name", "Head_Line", "Company", "Schools ", "Location", "Phone", "Email", "Connected_Date", 
                        "Connection", "Sumary", "Skill", "Language", "Course", "Project", "Publication", "URL" , "Experiences",
                         "Web", "Address", "BirthDay", "IM" ,"Avatar_URL" )
                         VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
                            value = [
                                name, headLine, company, education, location,
                                phone, email, connected_Time, connections,
                                summary, skills, langaues, course, project,
                                publication, URL, experiences, website,
                                address, birthDay, IM, avatar
                            ]
                            MyConnection.insertUpdateDB(command, value)
                            print("INSERT DONE !")
                        except Exception as e:
                            print("INSERT ERROR ! ", e)
                        time.sleep(randint(5, 20))

                        # Flag the URL as crawled.
                        try:
                            command = """UPDATE "Linkedin_URL" SET "Is_Crawl" = '1' WHERE "Linkedin_URL" = ? """
                            MyConnection.insertUpdateDB(command, [URL])
                            print(" Update DONE ! ")
                        except Exception as e:
                            print("UPDATE ERROR !  ", e)

                    except Exception as e:
                        print(e)
            finally:
                browser.quit()
        except Exception as e:
            print("ERROR ! ", e)