Example #1
    def LoginGetValue(root):
        #login
        from selenium import webdriver
        from selenium.webdriver.common.keys import Keys
        driver = webdriver.PhantomJS()
        driver.get("https://secure.lme.com/Data/Community/Login.aspx")
        driver.find_element_by_id('_logIn__userID').send_keys("username")
        driver.find_element_by_id('_logIn__password').send_keys("password")
        driver.find_element_by_id('_logIn__logIn').click()
        #enter the page
        driver.find_element_by_id('_subMenu__dailyStocksPricesMetals').click()
        date = driver.find_element_by_xpath(
            "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[6]/td[1]").text

        Copper = driver.find_element_by_xpath(
            "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[8]").text
        Aluminium = driver.find_element_by_xpath(
            "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[6]").text
        Nickel = driver.find_element_by_xpath(
            "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[12]").text
        Zinc = driver.find_element_by_xpath(
            "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[16]").text
        #print date
        #print Copper, Aluminium, Nickel, Zinc
        date1 = date.encode("utf-8")

        dateConvert = ("%s-%s-%s" % (date1[11:], date1[8:10], date1[5:7]))
        #print dateConvert
        driver.quit()
        LMELogin = (dateConvert, Copper.encode('utf-8'),
                    Aluminium.encode('utf-8'), Nickel.encode('utf-8'),
                    Zinc.encode('utf-8'))
        return LMELogin
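PhantomJS is no longer maintained and the find_element_by_* helpers were removed in Selenium 4, so this example only runs on older Selenium 2/3 installs. A minimal sketch of the same login-and-read step with a current Selenium and headless Chrome (the element IDs and XPath are copied from the example above and may no longer match the live site):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")   # assumption: a recent Chrome/chromedriver is available
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://secure.lme.com/Data/Community/Login.aspx")
    driver.find_element(By.ID, '_logIn__userID').send_keys("username")
    driver.find_element(By.ID, '_logIn__password').send_keys("password")
    driver.find_element(By.ID, '_logIn__logIn').click()
    driver.find_element(By.ID, '_subMenu__dailyStocksPricesMetals').click()
    date = driver.find_element(
        By.XPATH,
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[6]/td[1]").text
finally:
    driver.quit()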
Example #2
def getNews(url):
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "lxml")

        title = soup.title.string
        #print "Title: "+title
        kinds = soup.find(attrs={"name": "section"})['content']
        #print "Kinds: "+kinds

        date = soup.find("time").text.strip()
        date = date.replace(u"年", "-")
        date = date.replace(u"月", "-")
        date = date.replace(u"日", "")
        if kinds != u"寵物動物":
            if date.find(':') != -1:
                date += ":00"
            else:
                date += " 00:00:00"
            date = date.encode('utf-8')
        date = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
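For orientation, the replacement chain above turns a scraped value such as u"2016年3月5日 12:30" (a made-up example) into "2016-3-5 12:30:00" before strptime parses it; a minimal Python 2 sketch of those same steps:

from datetime import datetime

raw = u"2016年3月5日 12:30"                    # hypothetical scraped value
date = raw.replace(u"年", "-").replace(u"月", "-").replace(u"日", "")
if date.find(':') != -1:
    date += ":00"                              # already has HH:MM, so append seconds
else:
    date += " 00:00:00"
parsed = datetime.strptime(date.encode('utf-8'), "%Y-%m-%d %H:%M:%S")
# parsed == datetime(2016, 3, 5, 12, 30)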
Example #3
def download_annonce_leboncoin(id):
    global poubelle

    print "Download annonce %d" % id
    appart_url = "http://www.leboncoin.fr/locations/%d.htm" % id

    request = urllib2.Request(appart_url, headers=headers)
    response = urllib2.urlopen(request)
    the_page = response.read()
    pool = BeautifulSoup(the_page)

    upload_by = pool.find("div", {"class": "upload_by"})
    auteur = upload_by.find("a").string
    if (
        auteur == " ancea "
        or auteur == " bonapart immobilier "
        or auteur == " allo location "
        or auteur == " casa immo "
        or "hestia" in auteur.lower()
    ):
        poubelle += 1
        return
    date = unicode(upload_by.contents[2].string).strip()[:-1]

    titre = pool.find("div", {"class": "header_adview"}).find("h2").string

    try:
        date = datetime.fromtimestamp(
            time.mktime(time.strptime("2013 " + date.encode("utf-8"), u"%Y le %d %B à %H:%M".encode("utf-8")))
        )
    except AttributeError:
        date = datetime.now()

    params = pool.find("div", {"class": "lbcParams"})
    loyer = int(re.sub(r"[^\d-]+", "", params.find("span", {"class": "price"}).string[:-2]))
    ville = params.find("th", text=re.compile("Ville")).parent.td.string
    cp = int(re.sub(r"[^\d-]+", "", params.find("th", text=re.compile("Code postal")).parent.td.string))

    try:
        # pieces is not a mandatory param
        pieces_tag = params.find("th", text=re.compile(r"Pi.ces"))
        if pieces_tag:
            pieces = pieces_tag.parent.td.string
        else:
            pieces = None

        # furnished/unfurnished is not a mandatory param
        meuble_tag = params.find("th", text=re.compile(r"Meubl."))
        if meuble_tag:
            meuble = unicode(meuble_tag.parent.td.string.strip()) == u"Meublé"
        else:
            meuble = None

        # the surface area is not a mandatory param
        surface_tag = params.find("th", text=re.compile("Surface"))
        if surface_tag:
            surface = int(re.sub(r"[^\d-]+", "", surface_tag.parent.td.contents[0]))
        else:
            surface = None
    except AttributeError:
        print "Scraping problem"

    description = unicode(pool.find("div", {"class": "content"}))

    # this approach grabs the photos from the carousel code (only present when there are several photos)
    photos = re.findall(r"aImages\[\d\] = \"(http://.*)\";", the_page)
    if not photos:
        # with 0 or 1 photo there is no carousel, so try another way
        image_tag = pool.find("a", {"id": "image"})
        if image_tag:
            # there is one photo
            photos = re.findall(r"(http://.*\.jpg)", image_tag["style"])
    for photo in photos:
        photo_jobs.add(download_photo, (photo, appart_url))

    appart = Appartement(
        id, titre, loyer, ville, cp, pieces, meuble, surface, description, photos, date, auteur, "leboncoin", appart_url
    )
    try:
        Session.add(appart)
        Session.commit()
    except IntegrityError:
        print "Got integrity error while trying to add %d %s" % (id, appart)

    time.sleep(1)
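One detail worth noting: the "%Y le %d %B à %H:%M" format above only matches Leboncoin's French month names if a French locale is active for LC_TIME. A minimal Python 2 sketch of that prerequisite (assumes the fr_FR.UTF-8 locale is installed; the timestamp is a made-up value):

import locale
import time
from datetime import datetime

locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")    # %B now expects "janvier", "mars", ...
raw = u"le 12 mars à 18:40"                        # hypothetical "uploaded on" text
parsed = datetime.fromtimestamp(
    time.mktime(time.strptime("2013 " + raw.encode("utf-8"),
                              u"%Y le %d %B à %H:%M".encode("utf-8"))))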
Example #4
def extract_champs_xml(filename):	
	posts=[]
	print "    - fichier de posts: \""+filename+"\"..."
	file=open(filename,"r")
	line = file.readline()
	newpost=0
	while "</posts>" not in line:
		if (not (len(posts))%50) and (len(posts)>0):
			print "     [#"+str(len(posts)) +"]"
			
		if newpost==0:
			sline = line.split()
			if len(sline)>0:
				if (sline[0]=="<post>"):
					newpost=1
				else:
					line = file.readline()			
					
		if newpost==1:
			current = file.readline()
			while not "date" in current:
				current = file.readline()
			date=current[8:16]
			date = date[:4] + '-' + date[4:6] + '-' + date[6:] + ' '
			
			file.readline()
			current = file.readline()
			categ1 = current[10:-8]
			
			current = file.readline()
			categ2 = current[10:-8]
			
			current = file.readline()
			categ3 = current[10:-8]
			
			file.readline()
			file.readline()
			file.readline()
			file.readline()
			current= file.readline()
			permalink  = unicode(current[13:-13],'utf-8')
			file.readline()
			current= file.readline()
			website = unicode(current[8:-8],'utf-8')
			finished=0
			content=''
			title = ''
			while finished == 0:
				contentline=file.readline()
				scontentline=contentline.split('>')
				if len(scontentline)>0: 
					if scontentline[0]=="  <title": 
						finished=1
						u=unicode(contentline[9:-9],'utf-8')
						title = u.encode("latin-1")
						content = title + ' . ' + content.encode('utf-8') 
				#		print "title "+ title
				if finished==0: content+=unicode(contentline,'utf-8')
			
			content = content.decode('utf-8','replace')
			#title = title.decode('utf-8','replace')
			content=content.replace("\n"," ").replace("\r"," ").replace("<b>content: </b>       ","").replace("      <div>","")
			print "title " + title
			if verbose>1: print "    - content:",content
			contentclean=specialutf8(cleancontent(content.replace("</span>"," ").replace("</hr>"," ").replace("</li>"," ").replace("</a>"," ").replace("</br>"," ").replace("</div>"," ").replace("</p>"," ").replace("<hr />"," ").replace("</h1>"," ").replace("</h3>"," ").replace("</h4>"," ").replace("</h5>"," ").replace("</img>"," ")))
			#title= specialutf8(title)
			if verbose>1: print "    - clean content:",contentclean
			contentanchor=detecthref(content)			
			if verbose>1: print "    - anchor content:",contentanchor
#			title =title.encode('utf-8','replace')
			content =content.encode('utf-8','replace')
			contentclean =contentclean.encode('utf-8','replace')
			date=date.encode('utf-8','replace')
			permalink = permalink.encode('utf-8','replace')
			categ1=categ1.encode('utf-8','replace')
			categ2=categ2.encode('utf-8','replace')
			categ3=categ3.encode('utf-8','replace')
			contentanchor =contentanchor.encode('utf-8','replace')
			#posts.append([title,date,permalink,website,categ1,categ2,categ3,content,contentclean,contentanchor]) # with the raw html
			posts.append([title,date,permalink,website,categ1,categ2,categ3,contentclean,contentanchor]) # without the raw html
			newpost = 0
			line = current
		else: 
			line = file.readline()			
	file.close()

	print "---",len(posts),"posts processed."
#	print "    - such as: ",posts[0]
	return posts
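The fixed-offset slicing above ([8:16], [10:-8], [13:-13], ...) silently breaks if the export format shifts by a single character. As a hedged alternative, the date, permalink, website and title fields could be read with a real XML parser; this sketch assumes the file is well-formed and uses <post>, <date>, <permalink>, <website> and <title> elements, which is what the offsets above imply:

import xml.etree.ElementTree as ET

def extract_champs_xml_et(filename):
    posts = []
    for post in ET.parse(filename).getroot().iter("post"):
        date = post.findtext("date", "")
        # same YYYYMMDD -> "YYYY-MM-DD " reformat as in the example above
        date = date[:4] + '-' + date[4:6] + '-' + date[6:8] + ' '
        posts.append([post.findtext("title", ""),
                      date,
                      post.findtext("permalink", ""),
                      post.findtext("website", "")])
    return posts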
Example #5
def extract_champs_html(filename):	
	posts=[]
	print "    - fichier de posts: \""+filename+"\"..."
#	file = open(filename,"r")
	file=codecs.open(filename,"r","utf8")
	line = file.readline()
	newpost=0
	while line != "":
		if (not (len(posts))%50) and (len(posts)>0):
			print "     [#"+str(len(posts)) +"]"
		
		if newpost==0:
			sline = line.split()
			if len(sline)>0:
				if (sline[0]=="<div><b>title:"):
					newpost=1
		if newpost==1:
			#print "--- new post"
			title=file.readline()[12:-14]			
			#print title.encode('utf-8','replace')
			if verbose>0:
				 print "      - post title:",title
			current = file.readline()
			while not "date" in current:
				current = file.readline()
			date=file.readline()[12:-14]
			#print "    - date:",date
			file.readline()
			file.readline() #lf : IGNORED
			file.readline()
			file.readline() #accuracy : IGNORED
			file.readline()
			permalink=file.readline()[12:-14]
			if verbose>2: print "    - permalink:",permalink
			file.readline()
			website=file.readline()[12:-14]
			if verbose>2: print "    - website:",website
			file.readline()
			author=file.readline()[12:-14]
			if verbose>2: print "    - author:",author
			file.readline()
			file.readline() #query: IGNORED
			cur_ligne = file.readline()
			while not "categori" in cur_ligne:
				cur_ligne = file.readline()
			category=file.readline()[12:-14]
			if verbose>2: print "    - category:",category
			categs = category.split('>')
			#now is the content itself
			finished=0
			content=""
			while finished == 0:
				contentline=file.readline()
				if contentline=="": 
					finished=1
					line=""
				scontentline=contentline.split()
				if len(scontentline)>0: 
					if scontentline[0]=="<div><b>title:": finished=1
				if finished==0: content+=contentline
			content=content.replace("\n"," ").replace("\r"," ").replace("<b>content: </b>       ","").replace("      <div>","")
			categ1 = categs[0].replace(' ','')
			categ2 = categs[1].replace(' ','')
			categ3 = categs[2].replace(' ','')
			if verbose>1: print "    - content:",content
			contentclean=specialutf8(cleancontent(content.replace("</span>"," ").replace("</hr>"," ").replace("</li>"," ").replace("</a>"," ").replace("</br>"," ").replace("</div>"," ").replace("</p>"," ").replace("<hr />"," ").replace("</h1>"," ").replace("</h3>"," ").replace("</h4>"," ").replace("</h5>"," ").replace("</img>"," ")))
			title= specialutf8(title)
			if verbose>1: print "    - clean content:",contentclean
			contentanchor=detecthref(content)			
			if verbose>1: print "    - anchor content:",contentanchor
			title =title.encode('utf-8','replace')
			content =content.encode('utf-8','replace')
			contentclean =contentclean.encode('utf-8','replace')
			date=date.encode('utf-8','replace')
			permalink = permalink.encode('utf-8','replace')
			categ1=categ1.encode('utf-8','replace')
			categ2=categ2.encode('utf-8','replace')
			categ3=categ3.encode('utf-8','replace')
			contentanchor =contentanchor.encode('utf-8','replace')
			#posts.append([title,date,permalink,website,categ1,categ2,categ3,content,contentclean,contentanchor]) # with the raw html
			posts.append([title,date,permalink,website,categ1,categ2,categ3,contentclean,contentanchor]) # without the raw html
			newpost = 0
		else: 
			line = file.readline()
	file.close()

	print "---",len(posts),"posts processed."
#	print "    - such as: ",posts[0]
	return posts
Example #6
def extract_champs_txt(filename):	
	posts=[]
	print "    - post file \""+filename+"\"..."
	file=codecs.open(filename,"r","utf8")
	line = " "
	newpost=0
	while not line =='':
		if not (len(posts))%50:
			print "     [#"+str(len(posts)) +"]"
		if newpost==0:
			line = file.readline()
			sline = line.split()
			if len(sline)>0:
				if (sline[0]=="title:"):
					newpost=1
			
		if newpost==1:
			title=line[7:-1]
			if verbose>0:
				 print "      - post title:",title
			cur_ligne = ''
			while not "date" in cur_ligne:
				cur_ligne = file.readline()
			date=cur_ligne[6:-1]
			if verbose>0:
				print "    - date:",date
			#file.readline()
			file.readline() #lf : IGNORED
			#file.readline()
			file.readline() #accuracy : IGNORED
			#file.readline()
			
			permalink=file.readline()[11:-1]
			if verbose>2: print "    - permalink:",permalink
			#file.readline()
			website=file.readline()[9:-1]
			if verbose>2: print "    - website:",website
			#file.readline()
			author=file.readline()[8:-1]
			if verbose>2: print "    - author:",author
			#file.readline()
			file.readline() #query: IGNORED
			#file.readline()
			cur_ligne = ''
			while not "categorie" in cur_ligne:
				cur_ligne = file.readline()
				#print cur_ligne
			category=cur_ligne[12:]
			if verbose>2: print "    - category:",category
			categs = category.split('>')
			#now is the content itself
			finished=0
			content=""
			while finished == 0:
				contentline=file.readline()
				if"========" in contentline:
					finished=1
				scontentline=contentline.split()
				if len(scontentline)>0: 
					if scontentline[0]=="title:": finished=1
				if finished==0: content+=contentline
			content=content.replace("\n"," ").replace("\r"," ").replace("content:","")
			categ1 = categs[0].replace(' ','')
			categ2 = categs[1].replace(' ','')
			categ3 = categs[2].replace(' ','')

			if verbose>1: print "    - content:",content
			contentclean=cleancontent(content.replace("</span>"," ").replace("</hr>"," ").replace("</li>"," ").replace("</a>"," ").replace("</br>"," ").replace("</div>"," ").replace("</p>"," ").replace("<hr />"," ").replace("</h1>"," ").replace("</h3>"," ").replace("</h4>"," ").replace("</h5>"," ").replace("</img>"," "))
			if verbose>1: print "    - clean content:",contentclean
			contentanchor=detecthref(content)			
			if verbose>1: print "    - anchor content:",contentanchor
			contentclean =contentclean.encode('utf-8','replace')
			title =title.encode('utf-8','replace')
			content =content.encode('utf-8','replace')
			date=date.encode('utf-8','replace')
			permalink = permalink.encode('utf-8','replace')
			categ1=categ1.encode('utf-8','replace')
			categ2=categ2.encode('utf-8','replace')
			categ3=categ3.encode('utf-8','replace')
			contentanchor =contentanchor.encode('utf-8','replace')
			#posts.append([title,date,permalink,website,categ1,categ2,categ3,content,contentclean,contentanchor]) # with the raw html
			posts.append([title,date,permalink,website,categ1,categ2,categ3,contentclean,contentanchor]) # without the raw html
			newpost = 0
	file.close()

	print "---",len(posts),"posts processed."
#	print "    - such as: ",posts[0]
	return posts
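The long .replace() chains that strip closing tags in these three extractors could also be expressed as a single regular expression; a small sketch intended to mirror the chain above (same tag list, each match replaced by a space):

import re

CLOSING_TAGS = re.compile(r"</(?:span|hr|li|a|br|div|p|h1|h3|h4|h5|img)>|<hr />")

def strip_closing_tags(content):
    # Replace each closing tag (and the literal "<hr />") with a space.
    return CLOSING_TAGS.sub(" ", content)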
Example #7
def is_date_less_last_date(date, days_to_last_date):
    return datetime.strptime(date.encode('utf-8'), '%d %b %y').date() < get_last_date_to_parse(days_to_last_date)
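For reference, the '%d %b %y' pattern expects values like u"05 Mar 16"; a tiny Python 2 illustration of the comparison's left-hand side (the input value is made up, and get_last_date_to_parse() is defined elsewhere in the source project):

from datetime import datetime

parsed = datetime.strptime(u"05 Mar 16".encode('utf-8'), '%d %b %y').date()
# parsed == date(2016, 3, 5)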
Example #8
        soup3 = BeautifulSoup(res3.text)
        da = soup3.find('table', {"class": "subpage_data spFirst"})
        da2 = da.findAll('tr')
        date = 'null'
        for i in range(0, len(da2)):
            da3 = da2[i].findAll('td')
            if (da3[0].text == 'USA' and da3[2].text == ''):
                date = da3[1].text
                break
        if (date == 'null'):
            for i in range(0, len(da2)):
                da3 = da2[i].findAll('td')
                if (da3[0].text == 'USA' and da3[2].text == ' (limited)'):
                    date = da3[1].text
                    break
        data = date.encode('utf-8')

        #-------------------price
        res2 = requests.get(line + 'business?ref_=tt_dt_bus')
        soup2 = BeautifulSoup(res2.text)
        pr = soup2.find('div', {"id": "tn15content"})
        pr2 = pr.text.replace('\n', ' ').split(' ')
        cr = 0
        j = 0
        price = []
        pdate = []
        punit = []
        cc = pr.text.split('\n')
        for i in range(0, len(cc)):
            if (cc[i] == 'Weekend Gross'):
                cr = 1
#LME Price

driver.get("https://secure.lme.com/Data/Community/Login.aspx")
driver.find_element_by_id('_logIn__userID').send_keys("USERNAME")
driver.find_element_by_id('_logIn__password').send_keys("PSWORD")
driver.find_element_by_id('_logIn__logIn').click()
#enter the page
driver.find_element_by_id('_subMenu__dailyStocksPricesMetals').click()
date = driver.find_element_by_xpath("//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[6]/td[1]").text
        
Copper = driver.find_element_by_xpath("//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[8]").text
Aluminium = driver.find_element_by_xpath("//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[6]").text
Nickel = driver.find_element_by_xpath("//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[12]").text
Zinc = driver.find_element_by_xpath("//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[16]").text

date1 = date.encode("utf-8")
        
dateConvert = ("%s-%s-%s"%(date1[11:], date1[8:10], date1[5:7]))
        
driver.quit()


LMEArr=[Copper.encode('utf-8'), Aluminium.encode('utf-8') ,Nickel.encode('utf-8'), Zinc.encode('utf-8')]


#Currency

CResultArr=[]
user = '******'
password = "******"
url = "http://www.ctci.com.tw/Acc_Rep/rate/rate.asp"