Example #1
0
def scrap(folderBase, s):
    """Dump the message-board posts currently loaded in the browser to a file.

    Poster names, timestamps and message bodies are read from the
    module-level Selenium ``driver`` (using ``xpath_poster``, ``xpath_time``
    and ``xpath_msg``) and written to ``<folderBase>/yahoo_mb_<s>.txt``
    after a timestamp header.  Writing stops at the first post whose time
    is rejected by ``checkTime(s, time)``.

    Args:
        folderBase: Directory in which the output file is created.
        s: Identifier used in the file name and passed to ``checkTime``.

    Returns:
        The path of the file that was written.
    """
    filePath = folderBase + '/' + 'yahoo_mb_' + s + '.txt'

    # ``with`` closes the file even when the element lookups below raise.
    # The encoding is fixed so non-ASCII posts do not depend on the locale.
    with open(filePath, 'w', encoding='utf-8') as output:
        output.write('=========\n')
        output.write('Timestamp: ' +
                     datetime.datetime.now().strftime(TIME_FORMAT) + '\n')
        output.write('=========\n')
        posters = driver.find_elements_by_xpath(xpath_poster)
        times = driver.find_elements_by_xpath(xpath_time)
        msgs = driver.find_elements_by_xpath(xpath_msg)

        print(len(posters), len(times), len(msgs))

        # zip() stops at the shortest list, so a post that lacks a poster or
        # a timestamp element is skipped instead of raising IndexError.
        for poster_el, time_el, msg_el in zip(posters, times, msgs):
            try:
                # Keep the markup as ``str``: encoding it to bytes made the
                # endswith()/concatenation below raise TypeError, so no
                # message body was ever written.
                body = str(BeautifulSoup(msg_el.text, 'html.parser'))
                poster = poster_el.text
                posted = time_el.text

                if not checkTime(s, posted):
                    break

                output.write(poster + ' @ ' + posted + '\n')
                if body.endswith('More'):
                    # Drop the trailing "More" label (4 characters).
                    output.write(body[:-4])
                else:
                    output.write(body + '\n')
                output.write('---------\n')
            except Exception as ex:
                # Best effort: one unreadable post must not abort the dump,
                # but report it instead of hiding the failure.
                print('skipping post:', ex)
    return filePath
Example #2
0
	def sendSMS(self,mobileno,text):
		"""Post ``text`` to ``mobileno`` through the already opened session.

		Nothing is sent when the object is not logged in or when ``text`` is
		longer than 140 characters.  If ``self.captchaNeeded`` is set, the
		captcha image is downloaded to ``self.captchaPath``, shown with the
		external ``display`` command and the answer is read from stdin.

		The outcome is reported on stdout: silence on success, ``N : <reason>``
		when the request is refused, and a bare ``N`` when the response cannot
		be parsed (in which case ``self.captchaNeeded`` is switched on).  The
		raw response is saved to ``successResp.html``.

		Args:
			mobileno: Destination number, stored under ``dataDict['mobNo']``.
			text: Message body, at most 140 characters.

		Returns:
			None.
		"""
		if self.notLogged:
			print("you are not logged in - call logIn()")
			return
		if self.captchaNeeded:
			with open(self.captchaPath,"wb") as f:
				f.write(self.opener.open(self.captchaUrl).read())
			p = Popen(["display",self.captchaPath])
			try:
				self.dataDict['textcode'] = input("Captcha ? ")
			finally:
				# Close the viewer even if input() is interrupted, and reap
				# the child so it does not linger as a zombie process.
				p.kill()
				p.wait()
		if len(text) > 140:
			# Previously ignored without any feedback.
			print("N : text is longer than 140 characters")
			return

		self.dataDict['mobNo'] = mobileno
		self.dataDict['text'] = text
		try:
			h = self.opener.open(self.sendSMSUrl,self.postDataStr.format(**self.dataDict).encode())
			resp = h.read()
		except urllib.error.HTTPError as error:
			# Report the failure in the same "N : ..." form as other errors.
			print("N : "+str(error))
			return

		try:
			msg = BS(resp).find("div",attrs={"id":"quicksms"}).find("div",attrs={"class":"quickname"}).text.strip()
		except Exception:
			# Expected markup is missing.  ``Exception`` rather than a bare
			# except, so Ctrl-C and SystemExit still propagate.
			print("N")
			self.captchaNeeded = True
		else:
			if not msg.endswith("submitted successfully"):
				print("N : "+msg)
		with open("successResp.html","wb") as f:
			f.write(resp)
Example #3
0
def clean_str(raw: Optional[str],
              strip_trailing_period: bool = False) -> Optional[str]:
    """
    Normalize a short string (a name or a title) in any language.

    The value is passed through ftfy, stripped of HTML, and has its
    whitespace collapsed; an optional trailing period is removed. Returns
    None for empty input, for input that is empty after cleaning, and for
    values listed in UNWANTED_SHORT_STRINGS. See scrub_text(), which extends
    this function for paragraph length and longer text fields.
    """
    if not raw:
        return None

    # fix the encoding first, then drop HTML markup
    without_markup = BeautifulSoup(ftfy.fix_text(raw), "html.parser").get_text()

    # every run of whitespace becomes one space
    collapsed = re.sub(r"\s+", " ", without_markup).strip()

    # literal <em> tags can survive the HTML pass; remove them explicitly
    cleaned = collapsed.replace("<em>", "").replace("</em>", "").strip()

    if strip_trailing_period and cleaned.endswith("."):
        cleaned = cleaned[:-1]

    if cleaned.lower() in UNWANTED_SHORT_STRINGS or not cleaned:
        return None
    return cleaned
Example #4
0
def scrape_brands():
    """Yield the brands listed in the JSON document at ``US_BRANDS_URL``.

    Each entry's ``name.desktop`` HTML fragment is reduced to plain text.
    A name ending in ``*`` is yielded as
    ``{'brand': <name without the asterisk>, 'is_licensed': True}``; every
    other name is yielded as a bare string.
    """
    from contextlib import closing

    # closing() releases the HTTP response even if json.load() raises.
    with closing(urlopen(US_BRANDS_URL)) as response:
        brand_json = json.load(response)

    for brand in brand_json['brands']:
        name_html = brand['name']['desktop']
        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # is installed and warns.  Parsing also resolves HTML entities.
        name = BeautifulSoup(name_html, 'html.parser').text.strip()
        if name.endswith('*'):
            yield dict(brand=name[:-1], is_licensed=True)
        else:
            yield name
Example #5
0
 def get_progress(self, task_id):
     """Return the completion percentage reported for ``task_id``.

     Opens an ``AppAssureSession`` with this object's connection settings,
     fetches the task-monitor output and reads the text of the first
     ``<td>`` nested inside another ``<td>``.  Text that does not end in
     ``%`` is treated as ``100%``.

     Returns:
         The percentage as an ``int``.  On ``AppAssureError`` the ``text``
         attribute of the exception's second argument is returned instead;
         on ``ValueError`` or ``AttributeError`` the error message string.
     """
     with AppAssureSession(self.server, self.port, self.username,
             self.password) as session:
         try:
             events = Events(session).taskMonitor(task_id).text
             percent = BeautifulSoup(events).td.td.text
             if not percent.endswith('%'):
                 percent = "100%"
             return int(percent[:-1])
         except AppAssureError as e:
             # Exceptions cannot be indexed on Python 3; ``args[1]`` is the
             # same value ``e[1]`` gave on Python 2.
             return e.args[1].text
         except (ValueError, AttributeError) as e:
             return str(e)
Example #6
0
 def get_progress(self, task_id):
     """Return how far the task ``task_id`` has progressed, in percent.

     A session is opened with the stored server, port, username and
     password.  The task-monitor response is parsed and the text of the
     first ``<td>`` found inside another ``<td>`` is taken as the
     percentage; a value without a trailing ``%`` counts as ``100%``.

     Returns:
         An ``int`` percentage on success.  If ``AppAssureError`` is raised,
         the ``text`` attribute of its second argument; if ``ValueError`` or
         ``AttributeError`` is raised, the message of that error.
     """
     with AppAssureSession(self.server, self.port, self.username,
                           self.password) as session:
         try:
             events = Events(session).taskMonitor(task_id).text
             percent = BeautifulSoup(events).td.td.text
             if not percent.endswith('%'):
                 percent = "100%"
             return int(percent[:-1])
         except AppAssureError as e:
             # ``e[1]`` is Python 2 only; ``e.args[1]`` reads the same
             # element and also works on Python 3.
             return e.args[1].text
         except (ValueError, AttributeError) as e:
             return str(e)
Example #7
0
def scrape_company():
    """Yield ``(kind, data)`` pairs for the company and each of its brands.

    The first pair is ``('company', {...})`` built from ``COMPANY`` and
    ``COMPANY_URL``.  One ``('brand', {...})`` pair follows for every entry
    in the JSON document at ``US_BRANDS_JSON_URL``.  A brand dict always has
    ``company`` and ``brand``; ``is_licensed`` is added when the name ends
    in ``*`` (the asterisk is removed), and ``url`` when the entry has a
    non-empty ``brand_url``.
    """
    from contextlib import closing

    yield 'company', {'company': COMPANY, 'url': COMPANY_URL}

    # closing() releases the HTTP response even if json.load() raises.
    with closing(urlopen(US_BRANDS_JSON_URL)) as response:
        brand_json = json.load(response)

    for brand_dict in brand_json['brands']:
        brand = dict(company=COMPANY)

        name_html = brand_dict['name']['desktop']
        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # is installed and warns.  Parsing also resolves HTML entities.
        name = BeautifulSoup(name_html, 'html.parser').text.strip()
        if name.endswith('*'):
            brand['brand'] = name[:-1]
            brand['is_licensed'] = True
        else:
            brand['brand'] = name

        if brand_dict['brand_url']:
            # brand_url is resolved against ALL_BRANDS_URL.
            brand['url'] = urljoin(ALL_BRANDS_URL, brand_dict['brand_url'])

        yield 'brand', brand
Example #8
0
# Answer grids, 5 rows by 6 columns, pre-filled with 0 so that squares which
# never receive an answer keep a placeholder.
sjAnswers = [[0 for x in range(6)] for y in range(5)]
djAnswers = [[0 for x in range(6)] for y in range(5)]
row = 0
col = 0

# Walk the squares in row-major order: six columns, then on to the next row.
for square in sjDivs:
    text = square.find('td', class_='clue_text')
    if text:
        sjClues[row][col] = text.text
    answerDiv = square.find('div')
    if answerDiv:
        answer = extract.search(str(answerDiv))
        # The pattern may not match this div; keep the placeholder then,
        # instead of failing on ``None.group``.
        if answer:
            # Parsing unescapes the captured entity-encoded markup.
            pretty = BeautifulSoup(answer.group(1), 'html.parser').text
            if pretty.startswith('<i>'):
                pretty = pretty[3:]
            if pretty.endswith('</i>'):
                pretty = pretty[:-4]
            sjAnswers[row][col] = pretty
    col += 1
    if col == 6:
        col = 0
        row += 1

row = 0
col = 0
for square in djDivs:
    text = square.find('td', class_='clue_text')
    if text:
        djClues[row][col] = text.text
    answerDiv = square.find('div')
    if answerDiv:
def descargarCategoriaEspecifica22(URLLL, resultados):
	"""Scrape one product page and append one quoted, comma-separated row to ``resultados``.

	The page is fetched with ``descargarResultado("/producto/" + URLLL, 360, 10)``
	and taken apart with plain string splits.  The row has the columns
	codigo, nombre, marca, marca_auto, categoria, costo, modelo, fotos, anio
	and notas, in that order.  If anything in the per-vehicle section raises,
	a message is printed and a row holding only the quoted ``URLLL`` is
	appended instead.

	Args:
		URLLL: Product code; used as the page path suffix, as the ``codigo``
			column and as the photo file name.
		resultados: List that receives the output row (mutated in place).

	Returns:
		None.
	"""
	# NOTE(review): the meaning of 360 and 10 is defined by descargarResultado, not visible here.
	resultado = descargarResultado("/producto/" +  URLLL , 360, 10);


	try: 
		codigo = URLLL
	except:
		codigo = ''

	# Product name: text between the with-tabs <h2> and its closing tag.
	# "\\t" is the two-character sequence backslash + t, not a tab
	# (presumably the page text arrives escaped -- TODO confirm).
	try: 
		nombre = resultado.split('<h2 class="with-tabs">')[1].split('</h2>')[0].replace("\\t",'').strip()
	except:
		nombre = ''
	
	# Category: text after the </b> that closes the "<b>Categor..." label, up to </div>.
	try:	
		categoria = resultado.split('<b>Categor')[1].split('</div>')[0].split('</b>')[1].replace("\\t",'').replace("\\n",'').strip()
	except:
		categoria = ''

	# Price: text following the second 'class="uc-price">' marker, up to the next tag.
	try:	
		costo = resultado.split('class="uc-price">')[2].split('<')[0].replace("\\t",'').strip()
	except:
		costo = ''

	# Photo URL is derived from the product code; the page is not consulted.
	try:
		fotos = 'http://www.radec.com.mx/sites/all/files/productos/' + codigo + '.jpg';
	except:
		fotos = ''

	# Flag used to skip the first chunk of the split below (the text before the first car icon).
	val = 0;

	# Copy of the unmodified name; it is not read again in this function.
	nombre2 = nombre

	try:
		# Each chunk after a car-icon marker describes one vehicle application.
		for car in resultado.split("/sites/all/themes/radec/images/car_icon.gif"):


			# Reset the per-vehicle fields on every iteration.
			marca = ''
			marca_auto = ''
			modelo = ''
			anio = ''
			notas = ''

			if (val == 0):
				val = 1;
			else:
				# The vehicle text is what lies between the first '>' and the
				# following '<'; it is split on literal backslash-n sequences.
				# Index 2 is the vehicle make.
				try:
					marca_auto = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[2].strip()
				except:
					marca_auto = ''


				# Part brand is inferred from the product name; DEPO wins if both appear.
				try:
					marca = ''

					if (' TYC ' in nombre):
						marca = 'TYC'
					
					if ( ' DEPO ' in nombre):
						marca = 'DEPO' 
				except:
					marca = ''

				# Index 3 of the same split is the model.
				try:				
					modelo = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[3].strip()
				except:
					modelo = ''



				# Index 5 is the year or year range.  This line is not guarded:
				# a failure here reaches the outer handler, which emits the fallback row.
				anio = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[5].strip()


				if (anio != 'ALL YEARS'):
					# '#' marks the start of the string so that only leading
					# century digits are stripped: '2005-2010' becomes '05-10'.
					# Presumably that short form is how the years appear in nombre -- TODO confirm.
					anioOrigin2 = '#'+anio;
					anioOrigin = anioOrigin2.replace('#20','').replace('#19','').replace('-20','-').replace('-19','-')
					anioList = [];
	

					if ('-' in anio):
						
						anioInicio = int(anio.split('-')[0])
						anioFin = int(anio.split('-')[1] )

						# Expand the range into every year it covers.
						while (anioInicio <= anioFin):
							anioList.append(str(anioInicio))
							anioInicio = anioInicio + 1;
	
						anio = ' '.join(anioList) + ' '
					
				
					# Fewer than five years (or a single year, where anioList is empty):
					# put the full year(s) into the name.  Otherwise write "<start> a <end>".
					if (len(anioList) < 5):
						nombre = nombre.replace(anioOrigin,anio);
					else:
						nombre = nombre.replace(anioOrigin,anioOrigin2.replace('#','').replace('-',' a '));

		

				# Notes come from the page-wide "Aplicaciones" section, not from this
				# chunk, so every iteration computes the same value.
				try:	
					notas = resultado.split('<b>Aplicaciones:</b>')[1].split('</div>')[0].replace("\\t",'').replace("\\n",'').replace('<br/>',' - ')

					# Strip any remaining markup.
					notas = BeautifulSoup(notas, 'html.parser').text;

					# Collapse runs of spaces.
					while ("  " in notas):
						notas = notas.replace('  ',' ');

					# Remove a leading separator.
					if (notas.startswith(' - ')):
						notas = notas.replace(" - ", "", 1)

					# Remove a trailing separator; rreplace is a helper defined
					# elsewhere (presumably replace-from-the-right -- verify).
					if (notas.endswith(' - ')):
						notas = rreplace(notas," - ", "", 1);

				except:
					notas = ''



				# Expand the abbreviations used in product names.
				nombre= nombre.replace(' FD ', ' FORD ').replace(' CV ', ' CHEVROLET ').replace(' TY ', ' TOYOTA ').replace(' AD ', ' AUDI ').replace(' BK ', ' BUICK ').replace(' MC ', ' MERCEDES BENZ ').replace(' ST ', ' SEAT ').replace(' VW ', ' VOLKSWAGEN ').replace(' KI ', ' KIA ').replace(' NS ', ' NISSAN ').replace(' HD ', ' HONDA ').replace(' SN ',' SATURN ').replace(' JP ', ' JEEP ').replace(' AC ', ' ACURA ').replace(' DG ', ' DODGE ').replace(' PT ',' PONTIAC ').replace(' BW ', ' BMW ').replace(' CR ', ' CHRYSLER ').replace(' MT ', ' MITSUBISHI ').replace(' PG ',' PEUGEOT ').replace(' UNIV ', ' UNIVERSAL ').replace(' CR ', ' CHRYSLER ').replace(' MT ', ' MITSUBISHI ').replace(' PG ',' PEUGEOT ')
				nombre= nombre.replace(' JGO ', ' JUEGO ').replace(' CD ', ' CADILLAC ')

		# NOTE(review): this runs once, after the loop, so only the values of the
		# last vehicle chunk reach the row -- confirm that one row per product is intended.
		# Values are concatenated without escaping embedded double quotes.
		resultados.append('"'+codigo+'","'+nombre +'","'+ marca +'","'+ marca_auto +'","'+ categoria +'","'+costo +'","' + modelo +'","'+ fotos+'","'+ anio +'","'+ notas +'"');
	except Exception as e:
		# Any failure above: report the product code and emit a row holding only that code.
		print('FALLO ---- > ' + URLLL)
		resultados.append('"'+URLLL+'"');
	
	return;
Example #10
0
def getRound(soup, id, gameId, round):
    """Store one round's categories, clues and answers in the database.

    The round is looked up as the ``<div>`` with the given ``id``.  When that
    div holds exactly six ``td.category_name`` cells, its own categories and
    ``td.clue`` cells are used.  Otherwise the page-wide lists are sliced:
    the first six categories and 30 clue cells for ``round == 'Single'``,
    the next six and 30 for any other round when at least twelve categories
    exist, and nothing otherwise.

    Clue cells are read in row-major order into a 5x6 grid; squares without
    a clue or answer keep the placeholder ``0``.  One ``Categories`` row is
    inserted per category, followed by its five ``Clues`` rows with point
    values 200, 400, ... 1000.  Uses the module-level ``mycursor`` and
    ``mydb`` and commits after each insert.

    Args:
        soup: Parsed page.
        id: ``id`` attribute of the round's ``<div>``.
        gameId: Value stored in ``Categories.GameId``.
        round: Round code stored in ``Categories.RoundCode``.

    Returns:
        None.  Nothing happens when the div is not found.
    """
    div = soup.find('div', id=id)
    if div:
        num_cols = 6                       # categories per round
        num_rows = 5                       # clues per category
        grid_size = num_rows * num_cols    # clue cells per round

        categories = div.find_all('td', class_="category_name")
        if len(categories) == num_cols:
            clueDivs = div.find_all('td', class_='clue')
        else:
            # The round div is incomplete; fall back to the page-wide lists.
            categories = soup.find_all('td', class_="category_name")
            clueDivs = soup.find_all('td', class_='clue')
            if round == 'Single':
                categories = categories[:num_cols]
                # A round has rows x cols clue cells.  Slicing with the
                # category bounds kept only the first row of clues.
                clueDivs = clueDivs[:grid_size]
            elif len(categories) >= 2 * num_cols:
                categories = categories[num_cols:2 * num_cols]
                clueDivs = clueDivs[grid_size:2 * grid_size]
            else:
                categories = []
                clueDivs = []
        categories = list(map(getText, categories))

        if categories:
            clues = [[0 for x in range(num_cols)] for y in range(num_rows)]
            answers = [[0 for x in range(num_cols)] for y in range(num_rows)]
            # The answer sits entity-escaped inside the div's markup.
            extract = re.compile('correct_response&quot;&gt;(.*)&lt;/em&gt;')

            numClues = 0

            # Cap at grid_size so extra cells cannot index past the grid.
            for index, square in enumerate(clueDivs[:grid_size]):
                row, col = divmod(index, num_cols)
                text = square.find('td', class_='clue_text')
                if text:
                    clues[row][col] = text.text
                    numClues += 1
                answerDiv = square.find('div')
                if answerDiv:
                    answer = extract.search(str(answerDiv))
                    # The pattern may not match; keep the placeholder then,
                    # instead of failing on ``None.group``.
                    if answer:
                        # Parsing unescapes the captured markup.
                        pretty = BeautifulSoup(answer.group(1),
                                               'html.parser').text
                        if pretty.startswith('<i>'):
                            pretty = pretty[3:]
                        if pretty.endswith('</i>'):
                            pretty = pretty[:-4]
                        answers[row][col] = pretty

            # enumerate() rather than range(6): a short category list no
            # longer raises IndexError.
            for col, category in enumerate(categories):
                sql = "INSERT INTO Categories (GameId, RoundCode, Name) VALUES (%s, %s, %s)"
                val = (gameId, round, category)
                mycursor.execute(sql, val)
                mydb.commit()
                categoryId = mycursor.lastrowid

                sql = "Insert Into Clues (Categoryid, PointVal, Clue, Answer) Values (%s, %s, %s, %s)"
                val = [(categoryId, row * 200 + 200, clues[row][col],
                        answers[row][col])
                       for row in range(num_rows)]
                mycursor.executemany(sql, val)
                mydb.commit()

            print('\t', round, ': ', numClues)
        else:
            print('\t', round, ': no clues')