def getListURL(self):
     sQuery = """ SELECT DISTINCT "Linkedin_URL" from "Linkedin_URL" WHERE "Is_Crawl" is NULL AND "Row_ID" < 20000  """
     conn = MyConnection.getConnection()
     cursor = conn.cursor()
     cursor.execute(sQuery)
     result = cursor.fetchone()
     while result:
         self.listURL.append(result[0])
         result = cursor.fetchone()
     conn.close()
Exemple #2
0
    def main(self):
        try:
            sQuery = """   SELECT  DISTINCT "URL"  ,"Head_Line", "Schools ",
            "Sumary", "Skill", "Experiences"   from "Linkedin_Detail" where "Is_Picked" is  null """
            conn = MyConnection.getConnection()
            cursor = conn.cursor()
            cursor.execute(sQuery)
            result = cursor.fetchone()
            HeadLine = ""
            Schools = ""
            Sumary = ""
            Skill = ""
            Experiences = ""

            while result:
                url = result[0]
                HeadLine = result[1]
                Schools = result[2]
                # Sumary = result[3]
                Skill = result[4]
                Experiences = result[5]

                ## GET ALL DETAIL !

                li_Schools = Schools.split("---BREAK---")
                li_Sumary = Sumary.split("---BREAK---")
                li_Skill = Skill.split("---BREAK---")
                li_Experiences = Experiences.split("---BREAK---")

                ## Solve HeadLine !
                HeadLine = str(HeadLine).upper()
                # Kiểm tra trong Headline có từ "AT" không  ?
                li_Headline = HeadLine.split()
                if ("AT" in li_Headline):
                    index = int(li_Headline.index("AT"))
                    HeadLine = ' '.join(li_Headline[0:(index - 1)])

            ## Solve Company !
                for x in li_Schools:
                    try:
                        temp = x.split("\n")
                        school_Name = temp[0]
                        title = temp[2]
                        duration = temp[6]
                        field_Study = temp[4]
                        active = temp[7].replace('Activities and Societies:',
                                                 '')

                        sQuery = """ INSERT INTO public."Linkedin_School"(
                        "School_Name", "Secondary_Title", "Date_Attended", 
                        "Activitive", "Linkedin_URL", "Field_Study")
                        VALUES (?, ?, ?, ?, ?, ?) """
                        value = [
                            school_Name, title, duration, active, url,
                            field_Study
                        ]

                        MyConnection.insertUpdateDB(sQuery, value)
                        print("INSERT SCHOOL DONE ! ")

                    except Exception as e:
                        print("Pick Schools ERROR ", e)

            # Solving Experiences !
                for x in li_Experiences:
                    try:
                        temp = x.split("\n")
                        position = temp[0]
                        company_Name = temp[2]
                        date_Empl = temp[4]
                        empl_Duration = temp[6]

                        location = ""
                        desctiption = ""
                        if (7 < len(temp)):
                            location = temp[8]
                            desctiption = ' '.join(temp[9:(len(temp))])
                            print(desctiption)

                        sQuery = """INSERT INTO public."Linkedin_Experiences"(
                            "Position", "Company_Name", "Date_Employee", "Employee_Duration", "Location", 
                            "Description", "URL")
                            VALUES (?, ?, ?, ?, ?, ?, ?) """
                        value = [
                            position, company_Name, date_Empl, empl_Duration,
                            location, desctiption, url
                        ]

                        MyConnection.insertUpdateDB(sQuery, value)
                        print("INSERT EXPERIENCES DONE ! ")
                    except Exception as e:
                        print("Pick EXPERIENCES ERROR ", e)

            # Solving Skill !
                for x in li_Skill:
                    try:
                        temp = x.split("\n")
                        for i in temp:
                            sQuery = """INSERT INTO public."Linkedin_Skill"(
                            "Skill", "URL")
                            VALUES (?,?)"""
                            value = [i.strip(), url]

                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT SKILL DONE ! ")
                    except Exception as e:
                        print("INSERT SKILL ERROR ! ", e)

            ## UPDATE Is_Picked for thí URL !
                try:
                    sQuery = """  UPDATE "Linkedin_Detail" SET "Is_Picked" = '1'  WHERE "URL" =  ?"""
                    value = [url]
                    MyConnection.insertUpdateDB(sQuery, value)
                    print("Update DONE ! ")
                except Exception as e:
                    print("UPDATE ERROR ! ", e)
                result = cursor.fetchone()
            conn.close()

        except Exception as e:
            print("ERROR ! ", e)
    def main(self):
        try:
            sQuery = """ SELECT DISTINCT [URL] , [Skill] , [Row_ID], [Schools ]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE  Row_ID = '1523' """
            conn = MyConnection.getConnection()
            cursor = conn.cursor()
            cursor.execute(sQuery)
            result = cursor.fetchone()

            while result:
                url = result[0]
                skill = result[1]
                profile_ID = result[2]
                education = result[3]
                ## GET ALL  Experiences DETAIL !

                li_Skill = skill.split('---BREAK---')
                li_Edu = education.split('---BREAK---')

                ## Solve Skill  !
                for x in li_Skill:
                    if (len(x) > 2 and None != x and '' != x):
                        try:
                            sQuery = """INSERT INTO [dbo].[Linkedin_Skill_Header]
                                    ([Profile_ID]
                                    ,[URL]
                                    ,[Skill_Name])
                                VALUES(?,?,?) """
                            value = [profile_ID, url, x]
                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT SKILL DONE ! ")
                        except Exception as e:
                            print("INSERT SKILL ERROR ! ", e)

            ## Solve Education  !
                for e in li_Edu:
                    if (len(e) > 2 and None != e and '' != e):
                        try:
                            temp = e.split('\n')
                            school_Name = temp[0]
                            Degree_Name = temp[2]
                            field_Of_Study = temp[4]
                            dates_Attended = temp[6]
                            description = ' '.join(temp[7:len(temp)])

                            sQuery = """INSERT INTO [dbo].[Linkedin_Education_Header]
                                ([Profile_ID]
                                ,[URL]
                                ,[School_Name]
                                ,[Degree_Name]
                                ,[Field_Of_Study]
                                ,[Dates_Attended_Or_Expected_graduation]
                                ,[Education_Description])
                            VALUES (?,?,?,?,?,?,?) """
                            value = [
                                profile_ID, url, school_Name, Degree_Name,
                                field_Of_Study, dates_Attended, description
                            ]
                            MyConnection.insertUpdateDB(sQuery, value)

                            print("INSERT EDU DONE ! ")
                        except Exception as e:
                            print("INSERT EDU ERROR ! ", e)

                result = cursor.fetchone()
            conn.close()

        except Exception as e:
            print("ERROR ! ", e)
Exemple #4
0
    def main(self):
        try:
            # sQuery = """ SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID]
            # FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            # WHERE  Row_ID = '1507' """

            sQuery = """SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE    len (Experiences) > 5 and Row_ID < 1630  AND [Is_Tag] IS NULL """
            conn = MyConnection.getConnection()
            cursor = conn.cursor()
            cursor.execute(sQuery)
            result = cursor.fetchone()

            while result:
                temp_Experiences = result[1]
                location = result[2]
                url = result[0]
                avatar = result[3]
                profile_ID = result[4]
                ## GET ALL  Experiences DETAIL !
                li_Experiences = temp_Experiences.split('---BREAK---')

                ## Solve HeadLine !
                for e in li_Experiences:
                    temp_Element = e.split('\n')

                    try:
                        position = temp_Element[0]
                        company_Name = temp_Element[2]
                        date_Employee = temp_Element[4]
                        employee_Duration = temp_Element[6]
                        location = temp_Element[8]
                        description_Exp = ' '.join(
                            temp_Element[9:(len(temp_Element))])

                        sQuery = """ INSERT INTO [dbo].[LinkedIn_Experience_Header]
                                ( [URL]
                                ,[Position]
                                ,[Company_Name]
                                ,[Dates_Employed]
                                ,[Employment_Duration]
                                ,[Location]
                                ,[Experience_Description]
                                , [Profile_Id])
                            VALUES (?,?,?,?,?,?,?,?) """
                        value = [
                            url, position, company_Name, date_Employee,
                            employee_Duration, location, description_Exp,
                            profile_ID
                        ]
                        MyConnection.insertUpdateDB(sQuery, value)
                        print("INSERT DONE ! ")
                    except Exception as e:
                        print("NOT SAME PATTERN ! ", e)
                # UPDATE   !
                try:
                    sQuery = """ UPDATE [Linkedin_Detail]  SET Is_Tag = 1 WHERE  [Row_ID] = ? """
                    value = [profile_ID]
                    MyConnection.insertUpdateDB(sQuery, value)
                    print("UPDATE DONE ")
                except Exception as e:
                    print("UPDATE ERROR ! ", e)
                result = cursor.fetchone()
            conn.close()

        except Exception as e:
            print("ERROR ! ", e)
Exemple #5
0
    def main(self):
        sQuery = """ SELECT [Exp_Header_Id] , [Experience_Description], [URL] FROM [LinkedIn_Experience_Header]
         WHERE [Is_Crawl] IS NULL AND len([Experience_Description]) >2  """
        conn = MyConnection.getConnection()
        cursor = conn.cursor()
        cursor.execute(sQuery)
        result = cursor.fetchone()
        while result:
            # GET exe_ID & exe_Description !
            exe_ID = result[0]
            sentence = result[1]
            url = result[2]

            # Solving data !
            temp = sentence.split('-')

            for x in temp:
                if (len(x) > 5):  # Loại bỏ trường hợp bị null !
                    x = x.replace(',', '')
                    x = x.replace('(', '')
                    x = x.replace(')', '')
                    x = x.replace('.', '')
                    x = x.replace('*', '')
                    x = x.replace('#', '')
                    tokens = nltk.word_tokenize(x)
                    tagged = nltk.pos_tag(tokens)
                    entities = nltk.chunk.ne_chunk(tagged)
                    # print(entities)
                    for y in entities:
                        a1 = ""
                        a2 = ""
                        if (1 == len(y)):
                            a = str(y)
                            a = str(y).replace('(', '')
                            a = a.replace(')', '')
                            a = a.replace("'", "")
                            a = a.replace(',', '')
                            a = a.split()
                            a2 = a[1]
                            a_temp = a2.split('/')
                            a1 = a_temp[0]
                            a2 = a_temp[1]

                        if (len(y) > 2):
                            for i in y:
                                a = ' '.join(i[0:len(i)])
                                a = a.split()
                                a1 = a[0]
                                a2 = a[1]
                                # print(a1 , a2)

                        if (2 == len(y)):
                            a = str(y).replace('(', '')
                            a = a.replace(')', '')
                            a = a.replace("'", "")
                            a = a.replace(',', '')
                            a = a.split()
                            a1 = a[0]
                            a2 = a[1]
                        if (len(a1) > 1):
                            self.insertDB(a1, a2, exe_ID, url)
                        self.updateDB(exe_ID)

            result = cursor.fetchone()
        conn.close()
    def main(self, country, start, end, ID):
        try:

            ## GET ALL COMPANY NAME TO  SEARCH  !
            listKey = {}
            connection = MyConnection.getConnection()
            cursor = connection.cursor()

            ## GET COMMPANY NAME  USED TO SEARCH WITH BING!!
            SQLCommand = """SELECT DISTINCT [D-U-N-S] , [Company_Name_Clean] FROM [Company] WHERE
             [Company_Name_Clean] is not null  AND [Is_Crawl] is NULL and [Row_ID] >= ? and [Row_ID]  < ?  """
            value = [start, end]
            cursor.execute(SQLCommand, value)
            results = cursor.fetchone()
            while results:
                DUNS_ID = int(results[0])  # this is DUNS Number this comapany
                listKey[DUNS_ID] = results[1]  # Name of Company
                results = cursor.fetchone()
            connection.close()
            print("--------------> Total Size : ", len(listKey))
            ## Config for browser do not open web browser !
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            browser = webdriver.Chrome(chrome_options=options)

            browser.get("https://www.bing.com/")
            time.sleep(4)

            for duns in listKey.keys():

                time.sleep(randint(8, 15))

                print("Company Search : ", listKey[duns])
                keys = " \"VietNam \" " + " site: linkedin.com/in " + "\"" + listKey[
                    duns] + " \""
                KeySearch = browser.find_element_by_xpath(
                    "//*[@class='b_searchbox']")
                KeySearch.clear()
                KeySearch.send_keys(keys)  # Truyền tên cty vào để search
                time.sleep(randint(2, 6))
                # KeySearch.send_keys(u'\ue007') # Enter search !
                btnSearch = browser.find_element_by_xpath(
                    "//*[@id='sb_form_go']").click()
                sleep = int(randint(3, 15))
                time.sleep(sleep)

                try:
                    allRow = browser.find_elements_by_xpath(
                        "//*[@class='b_algo']")
                    count_2 = 0
                    if (0 < len(allRow)):
                        while True:
                            timeSpleep = randint(
                                5, 10)  #  Random delay time from  3 - 20s
                            print("TIME DELAY : ", timeSpleep)
                            time.sleep(timeSpleep)
                            if (50 == count_2):
                                time.sleep(40)
                                count_2 = 0
                            allRow = browser.find_elements_by_xpath(
                                "//*[@class='b_algo']")
                            browser.execute_script(
                                "window.scrollTo(0, document.body.scrollHeight);"
                            )  #kéo thanh cuộn xuống .
                            print("--------> Page Size : ", len(allRow))
                            for x in allRow:
                                txt = x.text
                                temp = txt.split()
                                findString = txt.find("LinkedIn")
                                urls = re.findall(
                                    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                                    x.text)
                                companyName = txt[0:int(findString)]
                                urls = ''.join(urls)
                                print(urls)
                                print(
                                    "--------------------------------------------------"
                                )

                                # Save to DB !
                                try:
                                    command = """INSERT INTO  [Linkedin_URL]
                                    ([Linkedin_Name]
                                    ,[Linkedin_URL]
                                    ,[DUNS_NUMBER]
                                    ,[Linkedin_Type]
                                    ,[Country])
                                VALUES (?,?,?,?,? )"""
                                    value = [
                                        companyName, urls, duns, "Profile",
                                        country
                                    ]
                                    MyConnection.insertUpdateDB(command, value)
                                    print("INSERT DONE ! ")
                                    count_2 += 1
                                except Exception as e:
                                    print("INSERT CRAWL ERROR : ", e)
                            browser.find_element_by_xpath(
                                "//*[@class='sb_pagN']").click()  # NExt page!
                            time.sleep(3)
                except Exception as e:
                    print("ERROR  WHEN CRAWLING ! ", e)

                # xác nhận đã search với keyword đó rồi :
                try:
                    command = """UPDATE [Company]  SET [Is_Crawl] = 1 WHERE  [D-U-N-S] = ? """
                    value = [duns]
                    MyConnection.insertUpdateDB(command, value)
                    print("UPDATE Crawl DONE ! ")
                    time.sleep(2)
                except Exception as e:
                    print("UPDATE Crawl Error : ", e)
        except Exception as e:
            print("ERROR IS : ", e)