Exemple #1
0
 def updateDB(self, exe_ID):
     """Flag one experience-header row as crawled.

     Runs an UPDATE through ``MyConnection.insertUpdateDB`` that sets
     ``[Is_Crawl] = 1`` on the ``[LinkedIn_Experience_Header]`` row whose
     ``[Exp_Header_Id]`` equals *exe_ID*.

     Any exception is printed and swallowed; nothing is returned.
     """
     statement = """ UPDATE   [LinkedIn_Experience_Header] SET [Is_Crawl] = 1 WHERE [Exp_Header_Id] = ? """
     try:
         MyConnection.insertUpdateDB(statement, [exe_ID])
         print("UPDATE DONE ! ")
     except Exception as err:
         print("UPDATE ERROR ! ", err)
Exemple #2
0
 def insertDB(self, key, tag, exe_ID, URL):
     """Store one tag row for an experience header.

     Inserts ``(exe_ID, URL, key, tag)`` into the ``[Exp_Header_Id]``,
     ``[URL]``, ``[Tag]`` and ``[Tag_Type]`` columns of
     ``[dbo].[Linkedin_Experience_Tag]`` via ``MyConnection.insertUpdateDB``.

     Any exception is printed and swallowed; nothing is returned.
     """
     # Order matches the column list in the statement below.
     row = [exe_ID, URL, key, tag]
     statement = """ INSERT INTO [dbo].[Linkedin_Experience_Tag]
         ([Exp_Header_Id]
         ,[URL]
         ,[Tag]
         ,[Tag_Type])
     VALUES (?,?,?,?)"""
     try:
         MyConnection.insertUpdateDB(statement, row)
         print("INSERT DONE ! ")
     except Exception as err:
         print("INSERT ERROR ! ", err)
 def updateSpiderStatus(self, flag):
     """Write this spider's on/off state to its ``[Spider_Status]`` row.

     ``int(flag) == 0`` records a stop: status 0, an empty last-page value,
     the current time as ``[Last_Stop]``, ``self.timeStart`` as
     ``[Last_Start]`` and ``self.current_URL``.  Any other value records a
     start: status 1 plus ``self.current_URL``, after which
     ``self.timeStart`` is reset to the current time.

     The row is selected by ``self.Spider_ID``.  Exceptions from the
     database call are not caught here.
     """
     if int(flag) != 0:
         # Enable: update the row first, then remember when this run began.
         start_sql = """UPDATE [Spider_Status] SET [Status] = 1, [URL] = ?  WHERE [Row_ID] = ? """
         MyConnection.insertUpdateDB(start_sql,
                                     [self.current_URL, self.Spider_ID])
         self.timeStart = datetime.now()
         return

     # Disable: persist where and when the spider stopped.
     stop_sql = """UPDATE [Spider_Status] SET [Status] = 0 , [Last_Page_Crawl] =? 
         , [Last_Stop]= ? , [Last_Start] = ? , [URL] = ?  WHERE [Row_ID] = ? """
     stop_args = [
         "",
         datetime.now(),
         self.timeStart,
         self.current_URL,
         self.Spider_ID,
     ]
     MyConnection.insertUpdateDB(stop_sql, stop_args)
 def getListURL(self):
     """Append every not-yet-crawled LinkedIn URL to ``self.listURL``.

     Selects the distinct "Linkedin_URL" values whose "Is_Crawl" is NULL
     and whose "Row_ID" is below 20000, and appends the first column of
     each row to ``self.listURL`` (which must already exist).

     Exceptions from the database layer propagate to the caller, but the
     connection is always closed first.
     """
     sQuery = """ SELECT DISTINCT "Linkedin_URL" from "Linkedin_URL" WHERE "Is_Crawl" is NULL AND "Row_ID" < 20000  """
     conn = MyConnection.getConnection()
     try:
         cursor = conn.cursor()
         cursor.execute(sQuery)
         # The loop stops at the first falsy fetchone() result.
         row = cursor.fetchone()
         while row:
             self.listURL.append(row[0])
             row = cursor.fetchone()
     finally:
         # Release the connection even when the query or a fetch raises.
         conn.close()
Exemple #5
0
    def main(self):
        """Explode unpicked "Linkedin_Detail" rows into school, experience and skill rows.

        Selects every row whose "Is_Picked" is NULL, splits its "Schools ",
        "Experiences" and "Skill" text on the "---BREAK---" separator and
        inserts one row per entry into "Linkedin_School",
        "Linkedin_Experiences" and "Linkedin_Skill" through
        ``MyConnection.insertUpdateDB``.  The source row is then flagged with
        "Is_Picked" = '1'.

        "Head_Line" and "Sumary" stay in the SELECT (the DISTINCT row set
        depends on them) but nothing is derived from them here.

        Blank entries (such as the one after a trailing separator) are
        skipped, and a missing column value is treated as empty text.
        Failures are printed and swallowed: a bad entry is skipped, while an
        error outside the per-entry handlers ends the run.  The connection is
        closed in every case.
        """
        try:
            sQuery = """   SELECT  DISTINCT "URL"  ,"Head_Line", "Schools ",
            "Sumary", "Skill", "Experiences"   from "Linkedin_Detail" where "Is_Picked" is  null """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    # Guard against None (presumably a NULL column) so one
                    # incomplete profile cannot abort the whole run.
                    li_Schools = (result[2] or "").split("---BREAK---")
                    li_Skill = (result[4] or "").split("---BREAK---")
                    li_Experiences = (result[5] or "").split("---BREAK---")

                    ## Schools: fields are read from fixed line positions.
                    for x in li_Schools:
                        if not x.strip():
                            continue
                        try:
                            temp = x.split("\n")
                            school_Name = temp[0]
                            title = temp[2]
                            field_Study = temp[4]
                            duration = temp[6]
                            active = temp[7].replace(
                                'Activities and Societies:', '')

                            sQuery = """ INSERT INTO public."Linkedin_School"(
                            "School_Name", "Secondary_Title", "Date_Attended", 
                            "Activitive", "Linkedin_URL", "Field_Study")
                            VALUES (?, ?, ?, ?, ?, ?) """
                            value = [
                                school_Name, title, duration, active, url,
                                field_Study
                            ]

                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT SCHOOL DONE ! ")

                        except Exception as e:
                            print("Pick Schools ERROR ", e)

                    ## Experiences: fields are read from fixed line positions.
                    for x in li_Experiences:
                        if not x.strip():
                            continue
                        try:
                            temp = x.split("\n")
                            position = temp[0]
                            company_Name = temp[2]
                            date_Empl = temp[4]
                            empl_Duration = temp[6]

                            location = ""
                            desctiption = ""
                            # Index 8 exists only when there are at least
                            # nine lines; shorter entries keep blank values.
                            if len(temp) > 8:
                                location = temp[8]
                                desctiption = ' '.join(temp[9:])
                                print(desctiption)

                            sQuery = """INSERT INTO public."Linkedin_Experiences"(
                                "Position", "Company_Name", "Date_Employee", "Employee_Duration", "Location", 
                                "Description", "URL")
                                VALUES (?, ?, ?, ?, ?, ?, ?) """
                            value = [
                                position, company_Name, date_Empl,
                                empl_Duration, location, desctiption, url
                            ]

                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT EXPERIENCES DONE ! ")
                        except Exception as e:
                            print("Pick EXPERIENCES ERROR ", e)

                    ## Skills: one row per non-blank line of each entry.
                    for x in li_Skill:
                        try:
                            for i in x.split("\n"):
                                skill_Name = i.strip()
                                if not skill_Name:
                                    continue
                                sQuery = """INSERT INTO public."Linkedin_Skill"(
                                "Skill", "URL")
                                VALUES (?,?)"""
                                value = [skill_Name, url]

                                MyConnection.insertUpdateDB(sQuery, value)
                                print("INSERT SKILL DONE ! ")
                        except Exception as e:
                            print("INSERT SKILL ERROR ! ", e)

                    ## Flag this URL as picked.
                    try:
                        sQuery = """  UPDATE "Linkedin_Detail" SET "Is_Picked" = '1'  WHERE "URL" =  ?"""
                        value = [url]
                        MyConnection.insertUpdateDB(sQuery, value)
                        print("Update DONE ! ")
                    except Exception as e:
                        print("UPDATE ERROR ! ", e)
                    result = cursor.fetchone()
            finally:
                # Close the connection on both the success and error paths.
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
    def main(self):
        """Split one profile's skill and education text into header rows.

        Reads [URL], [Skill], [Row_ID] and [Schools ] from
        [DB_LINKEDIN].[dbo].[Linkedin_Detail] for the hard-coded
        ``Row_ID = '1523'``, splits the skill and education text on
        '---BREAK---' and, for every entry longer than two characters,
        inserts a row into [dbo].[Linkedin_Skill_Header] or
        [dbo].[Linkedin_Education_Header] through
        ``MyConnection.insertUpdateDB``.

        A missing column value is treated as empty text.  Failures are
        printed and swallowed; the connection is closed in every case.
        """
        try:
            sQuery = """ SELECT DISTINCT [URL] , [Skill] , [Row_ID], [Schools ]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE  Row_ID = '1523' """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    profile_ID = result[2]
                    # Guard against None (presumably a NULL column) so the
                    # split below cannot abort the run.
                    li_Skill = (result[1] or '').split('---BREAK---')
                    li_Edu = (result[3] or '').split('---BREAK---')

                    ## Skills: one row per entry.
                    for x in li_Skill:
                        if len(x) <= 2:
                            continue
                        try:
                            sQuery = """INSERT INTO [dbo].[Linkedin_Skill_Header]
                                    ([Profile_ID]
                                    ,[URL]
                                    ,[Skill_Name])
                                VALUES(?,?,?) """
                            value = [profile_ID, url, x]
                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT SKILL DONE ! ")
                        except Exception as err:
                            print("INSERT SKILL ERROR ! ", err)

                    ## Education: fields are read from fixed line positions.
                    for edu in li_Edu:
                        if len(edu) <= 2:
                            continue
                        try:
                            temp = edu.split('\n')
                            school_Name = temp[0]
                            Degree_Name = temp[2]
                            field_Of_Study = temp[4]
                            dates_Attended = temp[6]
                            description = ' '.join(temp[7:])

                            sQuery = """INSERT INTO [dbo].[Linkedin_Education_Header]
                                ([Profile_ID]
                                ,[URL]
                                ,[School_Name]
                                ,[Degree_Name]
                                ,[Field_Of_Study]
                                ,[Dates_Attended_Or_Expected_graduation]
                                ,[Education_Description])
                            VALUES (?,?,?,?,?,?,?) """
                            value = [
                                profile_ID, url, school_Name, Degree_Name,
                                field_Of_Study, dates_Attended, description
                            ]
                            MyConnection.insertUpdateDB(sQuery, value)

                            print("INSERT EDU DONE ! ")
                        except Exception as err:
                            print("INSERT EDU ERROR ! ", err)

                    result = cursor.fetchone()
            finally:
                # Close the connection on both the success and error paths.
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
Exemple #7
0
    def main(self):
        """Split each untagged profile's experience text into header rows.

        Selects [Linkedin_Detail] rows with ``len(Experiences) > 5``,
        ``Row_ID < 1630`` and [Is_Tag] NULL.  The experience text is split on
        '---BREAK---'; each entry is split into lines and its fields are read
        from fixed line positions, then inserted into
        [dbo].[LinkedIn_Experience_Header] through
        ``MyConnection.insertUpdateDB``.  Afterwards the source row is flagged
        with ``Is_Tag = 1``.

        [Location] and [Avatar_URL] are selected but unused; the stored
        location comes from the experience entry itself.

        An entry with too few lines is reported as "NOT SAME PATTERN" and
        skipped.  Failures are printed and swallowed; the connection is
        closed in every case.
        """
        try:
            sQuery = """SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID]
            FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail]
            WHERE    len (Experiences) > 5 and Row_ID < 1630  AND [Is_Tag] IS NULL """
            conn = MyConnection.getConnection()
            try:
                cursor = conn.cursor()
                cursor.execute(sQuery)
                result = cursor.fetchone()

                while result:
                    url = result[0]
                    temp_Experiences = result[1]
                    profile_ID = result[4]

                    ## One header row per experience entry.
                    for entry in temp_Experiences.split('---BREAK---'):
                        temp_Element = entry.split('\n')

                        try:
                            position = temp_Element[0]
                            company_Name = temp_Element[2]
                            date_Employee = temp_Element[4]
                            employee_Duration = temp_Element[6]
                            location = temp_Element[8]
                            description_Exp = ' '.join(temp_Element[9:])

                            sQuery = """ INSERT INTO [dbo].[LinkedIn_Experience_Header]
                                    ( [URL]
                                    ,[Position]
                                    ,[Company_Name]
                                    ,[Dates_Employed]
                                    ,[Employment_Duration]
                                    ,[Location]
                                    ,[Experience_Description]
                                    , [Profile_Id])
                                VALUES (?,?,?,?,?,?,?,?) """
                            value = [
                                url, position, company_Name, date_Employee,
                                employee_Duration, location, description_Exp,
                                profile_ID
                            ]
                            MyConnection.insertUpdateDB(sQuery, value)
                            print("INSERT DONE ! ")
                        except Exception as err:
                            print("NOT SAME PATTERN ! ", err)

                    # Flag the profile as processed.
                    try:
                        sQuery = """ UPDATE [Linkedin_Detail]  SET Is_Tag = 1 WHERE  [Row_ID] = ? """
                        value = [profile_ID]
                        MyConnection.insertUpdateDB(sQuery, value)
                        print("UPDATE DONE ")
                    except Exception as err:
                        print("UPDATE ERROR ! ", err)
                    result = cursor.fetchone()
            finally:
                # Close the connection on both the success and error paths.
                conn.close()

        except Exception as e:
            print("ERROR ! ", e)
Exemple #8
0
    def main(self):
        """Tag the words of each uncrawled experience description and store the tags.

        Reads every [LinkedIn_Experience_Header] row whose [Is_Crawl] is NULL
        and whose description is longer than two characters.  The description
        is split on '-'; each fragment longer than five characters has the
        characters ``,().*#`` removed and is passed through
        ``nltk.word_tokenize``, ``nltk.pos_tag`` and ``nltk.chunk.ne_chunk``.
        For every item of the chunk result a (text, tag) pair is derived from
        the item's string form and handed to ``self.insertDB`` when the text
        is longer than one character.

        Each row is flagged through ``self.updateDB`` exactly once, after all
        of its fragments have been handled, so a row that produces no items
        is flagged too and is not selected again on the next run.

        Exceptions are not caught here; the connection is closed before they
        propagate.
        """
        sQuery = """ SELECT [Exp_Header_Id] , [Experience_Description], [URL] FROM [LinkedIn_Experience_Header]
         WHERE [Is_Crawl] IS NULL AND len([Experience_Description]) >2  """
        # Characters removed from a fragment before tokenizing.
        fragment_cleanup = str.maketrans('', '', ',().*#')
        # Characters removed from an item's string form before splitting it.
        item_cleanup = str.maketrans('', '', "()',")

        def flatten(item):
            """Return the whitespace-separated parts of str(item), punctuation removed."""
            return str(item).translate(item_cleanup).split()

        conn = MyConnection.getConnection()
        try:
            cursor = conn.cursor()
            cursor.execute(sQuery)
            result = cursor.fetchone()
            while result:
                exe_ID = result[0]
                sentence = result[1]
                url = result[2]

                for x in sentence.split('-'):
                    if len(x) <= 5:  # skip empty or near-empty fragments
                        continue
                    x = x.translate(fragment_cleanup)
                    tokens = nltk.word_tokenize(x)
                    tagged = nltk.pos_tag(tokens)
                    entities = nltk.chunk.ne_chunk(tagged)
                    for y in entities:
                        a1 = ""
                        a2 = ""
                        size = len(y)
                        if size == 1:
                            # Presumably a one-word subtree printed as
                            # "(LABEL word/TAG)"; keep the word and its tag.
                            a_temp = flatten(y)[1].split('/')
                            a1 = a_temp[0]
                            a2 = a_temp[1]
                        elif size == 2:
                            # First two parts of the flattened string form.
                            a = flatten(y)
                            a1 = a[0]
                            a2 = a[1]
                        elif size > 2:
                            # NOTE(review): every member overwrites a1/a2,
                            # so only the last member is stored. Confirm
                            # whether each member should be inserted instead.
                            for i in y:
                                a = ' '.join(i[0:len(i)]).split()
                                a1 = a[0]
                                a2 = a[1]

                        if len(a1) > 1:
                            self.insertDB(a1, a2, exe_ID, url)

                # Flag the row once, whether or not anything was inserted.
                self.updateDB(exe_ID)
                result = cursor.fetchone()
        finally:
            # Close the connection on both the success and error paths.
            conn.close()
    def main(self, country, start, end, ID):
        """Search Bing for LinkedIn profiles of each company and store the hits.

        Loads the uncrawled [Company] rows with ``start <= [Row_ID] < end``,
        then drives a headless Chrome session: for each company it submits a
        Bing query restricted to linkedin.com/in, walks the result pages and
        inserts one [Linkedin_URL] row per result.  The company is then
        flagged with ``[Is_Crawl] = 1``.

        Args:
            country: Value stored in the [Country] column of every inserted row.
            start: Inclusive lower bound on [Company].[Row_ID].
            end: Exclusive upper bound on [Company].[Row_ID].
            ID: Not used by this method; kept for interface compatibility.

        Paging for a company ends when the "next page" element cannot be
        found; that lookup error is reported as "ERROR  WHEN CRAWLING".
        Failures are printed and swallowed.  The database connection and the
        browser are released in every case.
        """
        # Raw string: same pattern as before, without invalid escapes.
        url_pattern = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )
        browser = None
        try:
            ## Collect the company names to search, keyed by D-U-N-S number.
            listKey = {}
            connection = MyConnection.getConnection()
            try:
                cursor = connection.cursor()
                SQLCommand = """SELECT DISTINCT [D-U-N-S] , [Company_Name_Clean] FROM [Company] WHERE
             [Company_Name_Clean] is not null  AND [Is_Crawl] is NULL and [Row_ID] >= ? and [Row_ID]  < ?  """
                cursor.execute(SQLCommand, [start, end])
                results = cursor.fetchone()
                while results:
                    listKey[int(results[0])] = results[1]
                    results = cursor.fetchone()
            finally:
                connection.close()
            print("--------------> Total Size : ", len(listKey))

            ## Run Chrome headless so no browser window is opened.
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            browser = webdriver.Chrome(chrome_options=options)

            browser.get("https://www.bing.com/")
            time.sleep(4)

            for duns, company in listKey.items():

                time.sleep(randint(8, 15))

                print("Company Search : ", company)
                keys = " \"VietNam \" " + " site: linkedin.com/in " + "\"" + company + " \""
                KeySearch = browser.find_element_by_xpath(
                    "//*[@class='b_searchbox']")
                KeySearch.clear()
                KeySearch.send_keys(keys)  # type the company query
                time.sleep(randint(2, 6))
                browser.find_element_by_xpath(
                    "//*[@id='sb_form_go']").click()
                time.sleep(randint(3, 15))

                try:
                    allRow = browser.find_elements_by_xpath(
                        "//*[@class='b_algo']")
                    count_2 = 0
                    if allRow:
                        # Leaves via the exception raised when no "next
                        # page" element exists (handled below).
                        while True:
                            timeSpleep = randint(5, 10)  # random 5-10 s delay
                            print("TIME DELAY : ", timeSpleep)
                            time.sleep(timeSpleep)
                            # ">=" so the long pause still fires when the
                            # counter steps over 50 between checks.
                            if count_2 >= 50:
                                time.sleep(40)
                                count_2 = 0
                            allRow = browser.find_elements_by_xpath(
                                "//*[@class='b_algo']")
                            browser.execute_script(
                                "window.scrollTo(0, document.body.scrollHeight);"
                            )  # scroll to the bottom of the page
                            print("--------> Page Size : ", len(allRow))
                            for x in allRow:
                                txt = x.text
                                findString = txt.find("LinkedIn")
                                # find() returns -1 when the marker is absent;
                                # keep the whole text then instead of
                                # chopping its last character.
                                if findString >= 0:
                                    companyName = txt[:findString]
                                else:
                                    companyName = txt
                                urls = ''.join(url_pattern.findall(txt))
                                print(urls)
                                print(
                                    "--------------------------------------------------"
                                )

                                # Save the hit.
                                try:
                                    command = """INSERT INTO  [Linkedin_URL]
                                    ([Linkedin_Name]
                                    ,[Linkedin_URL]
                                    ,[DUNS_NUMBER]
                                    ,[Linkedin_Type]
                                    ,[Country])
                                VALUES (?,?,?,?,? )"""
                                    value = [
                                        companyName, urls, duns, "Profile",
                                        country
                                    ]
                                    MyConnection.insertUpdateDB(command, value)
                                    print("INSERT DONE ! ")
                                    count_2 += 1
                                except Exception as e:
                                    print("INSERT CRAWL ERROR : ", e)
                            browser.find_element_by_xpath(
                                "//*[@class='sb_pagN']").click()  # next page
                            time.sleep(3)
                except Exception as e:
                    print("ERROR  WHEN CRAWLING ! ", e)

                # Record that this company has been searched.
                try:
                    command = """UPDATE [Company]  SET [Is_Crawl] = 1 WHERE  [D-U-N-S] = ? """
                    value = [duns]
                    MyConnection.insertUpdateDB(command, value)
                    print("UPDATE Crawl DONE ! ")
                    time.sleep(2)
                except Exception as e:
                    print("UPDATE Crawl Error : ", e)
        except Exception as e:
            print("ERROR IS : ", e)
        finally:
            # Shut the browser session down so no driver process is left.
            if browser is not None:
                try:
                    browser.quit()
                except Exception as e:
                    print("ERROR IS : ", e)
    def main(self):
        """Log in to LinkedIn and scrape every URL in ``self.listURL`` into "Linkedin_Detail".

        Calls ``self.getListURL()``, opens Chrome, signs in, then for each
        URL loads the profile, scrolls through it, reads the profile fields
        by XPath and inserts one "Linkedin_Detail" row through
        ``MyConnection.insertUpdateDB``.  The URL is then flagged with
        "Is_Crawl" = '1' in "Linkedin_URL", whether or not the insert
        succeeded.

        Most fields are best-effort: a failed lookup yields "".  List-like
        sections are stored as the text of every matching element, each
        followed by "---BREAK---".  Location and connections are read
        without a guard, so a profile lacking either is reported and
        skipped without being flagged.

        Failures are printed and swallowed.  The browser is shut down in
        every case.
        """
        breaker = "---BREAK---"
        # Raw string: same pattern as before, without invalid escapes.
        url_pattern = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )
        accomplishment = (
            "//*[@class='pv-profile-section accordion-panel pv-accomplishments-block %s ember-view']"
        )
        browser = None

        def text_of(xpath):
            """Return the text of the first element matching *xpath*, or "" on any failure."""
            try:
                return browser.find_element_by_xpath(xpath).text
            except Exception:
                return ""

        def joined_texts(xpath):
            """Return every match's text, each followed by the break marker; "" on any failure."""
            try:
                return "".join(
                    x.text + breaker
                    for x in browser.find_elements_by_xpath(xpath))
            except Exception:
                return ""

        try:
            self.getListURL()
            browser = webdriver.Chrome()

            browser.get("https://www.google.com")
            time.sleep(20)
            browser.get("https://www.linkedin.com")
            username = browser.find_element_by_xpath(
                "//*[@class='login-email']")
            password = browser.find_element_by_xpath(
                "//*[@class='login-password']")
            # NOTE(review): credentials are hard-coded in source; they should
            # come from configuration or the environment instead.
            username.send_keys("*****@*****.**")
            password.send_keys("duybaoo19")
            time.sleep(4)
            browser.find_element_by_xpath(
                "//*[@class='login submit-button']").click()
            time.sleep(randint(20, 40))

            # Scrape the details of each URL in listURL.
            for URL in self.listURL:
                try:
                    browser.get(URL)
                    print(URL)
                    time.sleep(randint(5, 20))

                    # Expand the contact-information panel when present.
                    try:
                        browser.find_element_by_xpath(
                            "//*[@class='contact-see-more-less link-without-visited-state']"
                        ).click()
                    except Exception as e:
                        print(e)
                    time.sleep(3)

                    browser.execute_script(
                        "window.scrollTo(0, 500);")  # scroll down
                    time.sleep(5)
                    # The avatar URL is taken from the element's inline style.
                    try:
                        style = browser.find_element_by_xpath(
                            "//*[@class=' presence-entity__image EntityPhoto-circle-8 ember-view']"
                        ).get_attribute('style')
                        avatar = url_pattern.findall(style)[0]
                    except Exception:
                        avatar = ""
                    browser.execute_script(
                        "window.scrollTo(0, 1000);")  # scroll further down
                    time.sleep(5)
                    browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);"
                    )  # scroll to the bottom of the page
                    time.sleep(5)

                    address = text_of(
                        "//*[@class='pv-contact-info__contact-type ci-address']"
                    )
                    IM = text_of(
                        "//*[@class='pv-contact-info__contact-type ci-ims']")
                    birthDay = text_of(
                        "//*[@class='pv-contact-info__contact-type ci-birthday']"
                    )
                    website = text_of(
                        "//*[@class='pv-contact-info__contact-type ci-websites']"
                    )
                    name = text_of(
                        "//*[@class='pv-top-card-section__name Sans-26px-black-85%']"
                    )
                    headLine = text_of(
                        "//*[@class='pv-top-card-section__headline Sans-19px-black-85%']"
                    )
                    company = text_of(
                        "//*[@class='pv-top-card-section__company Sans-17px-black-70% mb1 inline-block']"
                    )

                    # Unguarded: a failure here is caught by the per-URL
                    # handler below, which skips this profile.
                    location = browser.find_element_by_xpath(
                        "//*[@class='pv-top-card-section__location Sans-17px-black-70% mb1 inline-block']"
                    ).text
                    # NOTE(review): this is the same XPath as the headline,
                    # so "Connection" stores the headline text. Confirm the
                    # intended element.
                    connections = browser.find_element_by_xpath(
                        "//*[@class='pv-top-card-section__headline Sans-19px-black-85%']"
                    ).text

                    summary = text_of(
                        "//*[@class='pv-top-card-section__summary-text Sans-15px-black-55% mt5 pt5 ember-view']"
                    )
                    experiences = joined_texts(
                        "//*[@class='pv-profile-section__card-item pv-position-entity ember-view']"
                    )
                    education = joined_texts(
                        "//*[@class='pv-profile-section__sortable-card-item pv-education-entity pv-profile-section__card-item ember-view']"
                    )

                    # Expand the full skill list when the button exists.
                    try:
                        browser.find_element_by_xpath(
                            "//*[@class='pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar']"
                        ).click()
                        time.sleep(4)
                    except Exception:
                        print("NOT SHOW MORE SKILL ! ")
                    skills = joined_texts(
                        "//*[@class='pv-skill-entity__skill-name truncate Sans-15px-black-85%-semibold inline-block ']"
                    )

                    # Each accomplishment section is read from its own
                    # element list.
                    languages = joined_texts(accomplishment % "languages")
                    course = joined_texts(accomplishment % "courses")
                    project = joined_texts(accomplishment % "projects")
                    publication = joined_texts(
                        accomplishment % "publications")

                    phone = text_of("//*[@class='pv-contact-info__list']")
                    email = text_of(
                        "//*[@class='pv-contact-info__contact-link Sans-15px-black-55%']"
                    )
                    connected_Time = text_of(
                        "//*[@class='pv-contact-info__contact-item Sans-15px-black-55%']"
                    )

                    # Save the profile.
                    try:
                        command = """INSERT INTO  "Linkedin_Detail" (  "Name", "Head_Line", "Company", "Schools ", "Location", "Phone", "Email", "Connected_Date", 
                        "Connection", "Sumary", "Skill", "Language", "Course", "Project", "Publication", "URL" , "Experiences",
                         "Web", "Address", "BirthDay", "IM" ,"Avatar_URL" )
                         VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
                        value = [
                            name, headLine, company, education, location,
                            phone, email, connected_Time, connections, summary,
                            skills, languages, course, project, publication,
                            URL, experiences, website, address, birthDay, IM,
                            avatar
                        ]
                        MyConnection.insertUpdateDB(command, value)
                        print("INSERT DONE !")
                    except Exception as e:
                        print("INSERT ERROR ! ", e)
                    time.sleep(randint(5, 20))

                    # Flag the URL as crawled.
                    try:
                        command = """UPDATE "Linkedin_URL" SET "Is_Crawl" = '1' WHERE "Linkedin_URL" = ? """
                        value = [URL]
                        MyConnection.insertUpdateDB(command, value)
                        print(" Update DONE ! ")
                    except Exception as e:
                        print("UPDATE ERROR !  ", e)

                except Exception as e:
                    print(e)
        except Exception as e:
            print("ERROR ! ", e)
        finally:
            # Shut the browser down on success as well as on failure; the
            # None guard covers a driver that was never created.
            if browser is not None:
                try:
                    browser.quit()
                except Exception as e:
                    print("ERROR ! ", e)
Exemple #11
0
def _print_banner():
    """Print the decorative separator line shown around help and intro text."""
    print(
        "\n~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~//~~~~\n"
    )


def _print_help():
    """Print the command-line usage text (the ``-h`` option)."""
    _print_banner()
    print("Socket Development Test by Emily Souza")
    print(
        "\nThe program was developed using python 3.7 and does NOT work with versions older than 3.6"
    )
    print("\nCommands:\n")
    print("-p,  change the port number (default 50029)")
    print("-a,  change the IP address (default '189.6.76.118')")
    print(
        "-h,  display this help text and exit (ignoring others setting options)"
    )
    print("-i,  display the program introduction right before running")
    print(
        "\nMore information avaliable (in portuguese) in the README file"
    )
    _print_banner()


def _print_intro():
    """Print the program introduction (the ``-i`` option)."""
    _print_banner()
    print(
        "Socket Development Test for the internship selection process of Scytl - Innovating Democracy."
    )
    print("Developer: Emily Souza\n")
    print(
        "The program was developed using python 3.7 and does NOT work with versions older than 3.6\n"
    )
    print("The main functionalities are:")
    print(
        "1) Connecting to Scytl's Server using the IP Address '189.6.76.118' and port 50029"
    )
    print(
        "2) Receiving packets from the server encoded using \"Protocol X\""
    )
    print("3) Decoding the message to obtain the original one")
    print(
        "4) Changing spaces to underlines and strings to lower and upper characters"
    )
    print("5) Inverting the message")
    print("6) Reencoding the message with the same protocol")
    print("7) Sending the resulting data back to the server")
    print("8) Getting a confirmation from the server")
    _print_banner()
    print("Running the program...\n")


def _parse_args(argv):
    """Parse the command-line options (script name already stripped).

    Options are matched case-insensitively. ``-p`` and ``-a`` consume the
    following argument as the port and the IP address; ``-i`` prints the
    introduction as soon as it is seen; ``-h`` prints the help text and
    stops parsing immediately.

    Returns:
        ``(ip, port)``, or ``None`` when ``-h`` was given.

    Raises:
        ValueError: if the value following ``-p`` is not an integer.
    """
    ip = '189.6.76.118'
    port = 50029
    pending = None  # name of the option still waiting for its value
    for arg in argv:
        flag = arg.lower()
        # -h wins even when it appears where an option value was expected.
        if flag == '-h':
            _print_help()
            return None
        if pending == 'port':
            port = int(arg)
            pending = None
        elif pending == 'ip':
            ip = arg
            pending = None
        elif flag == '-p':
            pending = 'port'
        elif flag == '-a':
            pending = 'ip'
        elif flag == '-i':
            _print_intro()
    return ip, port


def _decode_message(protocol, raw_msg):
    """Split *raw_msg* into packets, decode each and strip the spacing."""
    decoded = ''.join(
        protocol.decode_packet(packet)
        for packet in Util.divide_packets(raw_msg))
    return Util.remove_space(decoded)


def _encode_message(protocol, message, prime_factor):
    """Encode *message* into the string that is sent back to the server.

    The message is padded with ``'_'`` to a multiple of four characters and
    encoded four characters at a time. Every resulting byte is XOR-ed with
    *prime_factor* and each group is wrapped between ``0xc6`` and ``0x6b``;
    the very last character of the result is replaced by ``0x21``.
    NOTE(review): 0xc6 / 0x6b / 0x21 look like packet-start, packet-end and
    end-of-transmission markers -- confirm against the protocol description.
    """
    padded = message + '_' * (-len(message) % 4)
    frames = []
    for start in range(0, len(padded), 4):
        bits = protocol.encode_packet(padded[start:start + 4])
        payload = ''.join(
            chr(int(bits[pos:pos + 8], 2) ^ prime_factor)
            for pos in range(0, len(bits), 8))
        frames.append(chr(0xc6) + payload + chr(0x6b))
    return ''.join(frames)[:-1] + chr(0x21)


def main():
    """Run one receive / transform / reply exchange with the server.

    Reads the options from ``sys.argv`` (see ``-h``), opens the connection,
    decodes the received message, converts it (case change, spaces to
    underscores, reversal), sends the re-encoded result back and prints the
    server's confirmation. Returns ``None``; with ``-h`` it returns right
    after printing the help text, before any connection is opened.
    """
    # argv[0] is the script name, not an option.
    options = _parse_args(sys.argv[1:])
    if options is None:
        return
    ip, port = options

    # Protocol info: 5-bit code word -> 4-bit value.
    table1 = {'11110': 0x0, '01001': 0x1, '10100': 0x2, '10101': 0x3,
              '01010': 0x4, '01011': 0x5, '01110': 0x6, '01111': 0x7,
              '10010': 0x8, '10011': 0x9, '10110': 0xA, '10111': 0xB,
              '11010': 0xC, '11011': 0xD, '11100': 0xE, '11101': 0xF}
    protocol = ProtocolX.ProtocolX(table1)
    prime_factor = Util.get_prime_factor(port)

    # Open Connection
    connection = MyConnection.MyConnection(ip, port)
    try:
        # Decode
        processed_msg = _decode_message(protocol, connection.get_msg())
        print("Decoded message: " + processed_msg + '\n')

        # Process data
        processed_msg = Util.upper_lower(processed_msg)
        processed_msg = processed_msg.replace(' ', '_')
        processed_msg = processed_msg[::-1]  # invert message
        print("Processed message: " + processed_msg + '\n')

        # Encode and send
        msg_to_send = _encode_message(protocol, processed_msg, prime_factor)
        connection.send_msg(msg_to_send)
        print("Sent message: " +
              ''.join(' ' + hex(ord(ch)) for ch in msg_to_send) + '\n')

        # Confirmation
        msg_confirm = _decode_message(protocol, connection.get_msg())
        print("Confirmation message: " + msg_confirm + '\n')
    finally:
        # Close Connection -- also when any step above raised.
        connection.close_socket()