Example #1
0
    def search_jobs(self, keywords_to_scrap):
        """Search the job listing for each subscribed keyword and parse the results.

        keywords_to_scrap: dict mapping keyword_id -> [keyword, page_limit, ...]
        (assumed from usage here and from get_subscribed_keywords -- TODO confirm).
        On a TimeoutException the stored user agent is cleared, the driver is
        re-initialized and the whole search is retried (guarded by the retry
        counter stored in the db).
        """
        self.driver.save_screenshot("/tmp/upwrk.png")
        try:
            # .items() instead of the Python-2-only .iteritems()
            for key, attributes_list in keywords_to_scrap.items():
                keywrd = attributes_list[0]
                elem = WebDriverWait(self.driver, 10).until(
                    lambda x: x.find_element_by_css_selector("#eoFreelancerSearchInput"))
                elem.clear()
                logger.info("Searching keywords in job listing\n")
                logger.info("filtering for %s " % keywrd)
                sleep(20)
                elem.send_keys(keywrd)
                elem.send_keys(Keys.RETURN)
                # parse first page
                sleep(120)
                self.parse_page(key)
                # Navigate through the pagination from results returned after search
                page_limit = attributes_list[1]
                if page_limit > 1:
                    print("Scrolling to page two")
                    visited_links = []
                    for i in range(page_limit):
                        pagination_list = self.driver.find_elements_by_css_selector(
                            "div ul.pagination li a")
                        for next_page in pagination_list:
                            # skip blank entries and the page we start on ('1')
                            if not next_page.text.strip() or next_page.text.strip() == '1':
                                continue
                            print("Next page text %s" % str(next_page.text))
                            print("Will compare with %s " % str(i + 1))
                            if next_page.text == str(i + 1) and next_page.text not in visited_links:
                                logger.info("Scrolling to page %s" % str(next_page.text))
                                visited_links.append(next_page.text)
                                next_page.click()
                                sleep(120)
                                self.parse_page(key)
                                break
        except TimeoutException as e:
            logger.info("Could not open job listing page to search for keywords")
            counter = self._get_counter()
            logger.info("retry counter at %d " % counter)
            if counter != 1:
                self._increment_counter()
                logger.info("retrying")
                self.dbUtil = dbConnection()
                self.cur = self.dbUtil.get_cursor()
                # clearing the stored user agent forces __init__ to generate a new one
                self.cur.execute("update site set user_agent=Null where  site_id=2")
                self.dbUtil.commit()
                sleep(120)
                self.tear_down()
                # re-initialize the driver and retry the whole search
                self.__init__()
                self.open_job_listing_page()
                keywords_to_scrap = self.get_subscribed_keywords()
                self.search_jobs(keywords_to_scrap)
Example #2
0
 def _get_counter(self):
     """Return the stored retry count for site_id 2, or 0 when no row exists."""
     self.dbUtil = dbConnection()
     self.cur = self.dbUtil.get_cursor()
     self.cur.execute("select retry from retry_counter where site_id=2")
     rows = self.cur.fetchall()
     self.dbUtil.commit()
     # the original loop kept the value of the last returned row
     return rows[-1][0] if rows else 0
Example #3
0
def jobs_colletor():
    '''
        Collects jobs to be sent and packages them in a dictionary: jobs_proprties.
        jobs_proprties: the key is the subscriber id, its value is a map of the jobs properties.
        jobs_proprties value which is a map has its key in the format
        'subscriber_id:site_id:keyword_id:job_id'; the value is a list of the properties
        in the order: keyword_id, keyword, site_id, site_name, job_detail, email,
        job_link, post_time, other_info.

        Returns the jobs_proprties dictionary ({} when there is nothing to send).
    '''
    dbUtil = dbConnection()
    cur = dbUtil.get_cursor()
    jobs_to_dispatch = '''
                    select tt1.subscriber_id as subscriber_id,tt1.keyword_id as keyword_id,
                    tt1.keyword as keyword,tt1.site_id as site_id,tt1.name as site_name,
                    jb.detail as job_detail,tt1.email as email,jb.link as job_link,
                    jb.time_created as post_time,jb.other_info as other_info,jb.job_id as job_id from 
                   (select subscrib.subscriber_id,subscrib.email,keyw.keyword_id,keyw.keyword,sit.site_id,sit.name,subscrip.minimum_alert as jobs_no,keyw.keyword,sit.name
                    from subscription subscrip 
                    inner join subscriber as subscrib on  subscrip.subscriber_id=subscrib.subscriber_id
                    inner join keyword keyw on subscrip.keyword_id=keyw.keyword_id
                    inner join site sit on subscrip.site_id=sit.site_id) as tt1,
                    
                    (select count(*) as jobs_no,jobo.keyword_id as keyword_id,jobo.site_id as site_id,subscrib.subscriber_id as subscriber_id from jobs jobo
                    inner join subscription subs on subs.site_id=jobo.site_id and subs.keyword_id=jobo.keyword_id
                    inner join subscriber subscrib on subscrib.subscriber_id=subs.subscriber_id  
                    where not exists (select 1 from sent_jobs where  subscrib.subscriber_id=subscriber_id and jobo.job_id=job_id)
                    group by subscrib.subscriber_id,jobo.keyword_id,jobo.site_id) as tt2
                    inner join jobs jb on 
                    tt1.site_id=jb.site_id  and tt1.keyword_id=jb.keyword_id and jb.job_id=jb.job_id
                    where  
                    tt1.site_id=tt2.site_id  and tt1.keyword_id=tt2.keyword_id and tt1.subscriber_id=tt2.subscriber_id and tt1.jobs_no<=tt2.jobs_no
                    and not exists (select 1 from sent_jobs where    tt1.subscriber_id=subscriber_id and jb.job_id=job_id)
                    order by tt1.subscriber_id,tt1.name,tt1.keyword
                    '''
    logger.info("Searching for pending jobs to send")
    cur.execute(jobs_to_dispatch)
    data_to_dispatch = cur.fetchall()
    jobs_proprties = {}
    # The original guard used `or`, which would crash with len(None) when the
    # result set was None; a simple truthiness test covers None and empty.
    if data_to_dispatch:
        for job in data_to_dispatch:
            # composite key: subscriber_id:site_id:keyword_id:job_id
            keyr = str(job[0]) + ':' + str(job[3]) + ':' + str(
                job[1]) + ':' + str(job[10])
            # create the per-subscriber map and per-key list on first sight
            # instead of driving normal control flow through KeyError + error logs
            if job[0] not in jobs_proprties:
                jobs_proprties[job[0]] = collections.OrderedDict()
            if keyr not in jobs_proprties[job[0]]:
                jobs_proprties[job[0]][keyr] = []
            jobs_proprties = _add_jobs_properties_to_list(jobs_proprties, job)
    # the original built this dict but never returned it -- callers got None
    return jobs_proprties
Example #4
0
def update_sent_jobs(message_properties_dictionary):
    """Record every dispatched job in the sent_jobs table.

    message_properties_dictionary: {subscriber_id: {composite_key: properties_list}}
    where properties_list[9] is the job_id (assumed from the insert below --
    TODO confirm against the collector that builds this dict).
    Each insert is wrapped in its own try so one bad row does not abort the rest.
    """
    logger.info("Updating db on sent items")
    # one connection for the whole batch instead of one per job (the original
    # re-created dbConnection inside the innermost loop)
    dbUtil = dbConnection()
    cur = dbUtil.get_cursor()
    count = 0
    for key in message_properties_dictionary:
        job_dict = message_properties_dictionary[key]
        count = count + 1
        for ky in job_dict:
            try:
                logger.debug("Updating user_id %d, job_id: %d" %
                             (key, job_dict[ky][9]))
                cur.execute(
                    "insert into sent_jobs values(?,?,datetime('now'))",
                    (key, job_dict[ky][9]))
                dbUtil.commit()
            except Exception as e:
                logger.error(e)
Example #5
0
 def __init__(self):
     """Initialize the upwork crawler: load the stored user agent for site_id 2,
     generating and persisting a random one from agent_list on first run."""
     logger.info("initializing upwork crawler")
     # chrome/headless options were disabled here; the crawler targets this proxy url
     self.init_url="http://139.59.4.7:8080/"
     # get user agent
     self.dbUtil = dbConnection()
     self.cur = self.dbUtil.get_cursor()
     self.cur.execute("select user_agent from site where site_id=2")
     data = self.cur.fetchall()
     try:
         if data[0][0] is None or len(data[0][0]) == 0:
             print("generating agent")
             # pick a random agent and persist it so it is reused across restarts
             index = randint(0, len(agent_list) - 1)
             print(agent_list[index])
             self.cur.execute("update site set user_agent=? where site_id=2",
                              (agent_list[index],))
             self.dbUtil.commit()
         else:
             agnt = ""
             for agent in data:
                 agnt = agent[0]
     except IndexError as i:
         # no row for site_id=2 at all -- fall through with whatever data holds
         agnt = ""
         for agent in data:
             agnt = agent[0]
         logger.error('No user agent specified')
Example #6
0
 def __init__(self):
     """Set up the db connection, the email body template and the csv exporter,
     and make sure the 'HomeWork Market' site row exists."""
     logger.info("Initializing sequence")
     self.con = None
     self.dbUtil = dbConnection()
     self.cur = self.dbUtil.get_cursor()
     self.mesgBody = ''' 
     Hi Duncan,
     
         We found the following jobs that we Think might be Interesting to you.
         Please Check them out.
     
     Kind Regards,
     Duncan.
     '''
     # NOTE(review): binary mode plus the Python-2-only `unicode` name suggest
     # this module runs on Python 2 -- confirm before porting.
     self.waks = open("waks.csv", 'wb')
     self.exporter = CsvItemExporter(self.waks, unicode)
     self.exporter.start_exporting()
     try:
         self.cur.execute(
             "insert into site(site_id,name) values(1,'HomeWork Market')")
         self.dbUtil.commit()
     except Exception as e:
         # best effort: the row already exists on subsequent runs; log at debug
         # instead of the original silent bare `except: pass`
         logger.debug(e)
Example #7
0
class upwork:
    """Selenium (Firefox) crawler for Upwork job listings, currently pointed at a
    proxy url; persists its user agent and site row in the db (site_id 2)."""

    def __init__(self):
        logger.info("initializing upwork crawler")
        # chrome/headless options were disabled here in favour of plain Firefox below
        self.init_url="http://139.59.4.7:8080/"
        # get user agent: reuse the stored one, or generate and persist a random one
        self.dbUtil = dbConnection()
        self.cur = self.dbUtil.get_cursor()
        self.cur.execute("select user_agent from site where site_id=2")
        data = self.cur.fetchall()
        try:
            if data[0][0] is None or len(data[0][0]) == 0:
                print("generating agent")
                index = randint(0, len(agent_list) - 1)
                print(agent_list[index])
                self.cur.execute("update site set user_agent=? where site_id=2",
                                 (agent_list[index],))
                self.dbUtil.commit()
            else:
                agnt = ""
                for agent in data:
                    agnt = agent[0]
        except IndexError as i:
            # no row for site_id=2 yet
            agnt = ""
            for agent in data:
                agnt = agent[0]
            logger.error('No user agent specified')
        self.driver = webdriver.Firefox()

        try:
            self.dbUtil = dbConnection()
            self.cur = self.dbUtil.get_cursor()
            print("Creating upwork db record")
            # idempotent insert: only create the site row when it does not exist yet
            self.cur.execute("insert into site(site_id,name) select 2,'Upwork' WHERE NOT EXISTS(SELECT 1 FROM site WHERE site_id = 2 AND name = 'Upwork');  ")
            self.dbUtil.commit()
        except Exception as e:
            logger.error(e, exc_info=True)
Example #8
0
 def _reset_counter(self):
     """Reset the retry counter for site_id 2 back to zero (delete + re-insert)."""
     connection = dbConnection()
     cursor = connection.get_cursor()
     self.dbUtil = connection
     self.cur = cursor
     cursor.execute("delete from retry_counter where site_id=2")
     cursor.execute("insert into retry_counter(retry,site_id) values(0,2)")
     connection.commit()