def scrap(folderBase, s):
    filePath = folderBase + '/' + 'yahoo_mb_' + s + '.txt'
    output = open(filePath, 'w')
    output.write('=========\n')
    output.write('Timestamp: ' + datetime.datetime.now().strftime(TIME_FORMAT) + '\n')
    output.write('=========\n')
    posters = driver.find_elements_by_xpath(xpath_poster)
    times = driver.find_elements_by_xpath(xpath_time)
    msgs = driver.find_elements_by_xpath(xpath_msg)
    print(len(posters), len(times), len(msgs))
    try:
        for i in range(len(msgs)):
            try:
                # keep the message body as a str so endswith()/write() work
                # against the text-mode file
                soup = BeautifulSoup(msgs[i].text, 'html.parser').get_text()
                poster = posters[i].text
                time = times[i].text
                if not checkTime(s, time):
                    break
                output.write(poster + ' @ ' + time + '\n')
                if soup.endswith('More'):
                    # drop the trailing 'More' link text
                    output.write(soup[:-4])
                else:
                    output.write(soup + '\n')
                output.write('---------\n')
            except Exception as ex:
                pass
    finally:
        output.close()
    return filePath
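# scrap() leans on module-level globals that are not shown here: a Selenium
# `driver` already pointed at the message board, the three XPath strings,
# TIME_FORMAT, and a checkTime() helper. A rough setup sketch follows; every
# value below is a placeholder, not the original project's configuration.
from selenium import webdriver

TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
xpath_poster = '//placeholder/xpath/to/poster-name'
xpath_time = '//placeholder/xpath/to/post-time'
xpath_msg = '//placeholder/xpath/to/message-body'

def checkTime(symbol, time_text):
    # assumed stub: decide whether a post is recent enough to keep scraping
    return True

driver = webdriver.Chrome()  # Selenium 3-style driver, matching find_elements_by_xpath above
driver.get('https://finance.yahoo.com/')   # placeholder board URL
filePath = scrap('/tmp/boards', 'AAPL')    # hypothetical output folder and ticker
driver.quit()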
def sendSMS(self, mobileno, text):
    if(self.notLogged):
        print("you are not logged in - call logIn()")
        return
    if(self.captchaNeeded):
        with open(self.captchaPath, "wb") as f:
            f.write(self.opener.open(self.captchaUrl).read())
        p = Popen(["display", self.captchaPath])
        self.dataDict['textcode'] = input("Captcha ? ")
        p.kill()
    if(len(text) <= 140):
        self.dataDict['mobNo'] = mobileno
        self.dataDict['text'] = text
        # print(self.postDataStr.format(**self.dataDict))
        try:
            h = self.opener.open(self.sendSMSUrl, self.postDataStr.format(**self.dataDict).encode())
            resp = h.read()
            try:
                msg = BS(resp).find("div", attrs={"id": "quicksms"}).find("div", attrs={"class": "quickname"}).text.strip()
                if msg.endswith("submitted successfully"):
                    pass
                else:
                    print("N : " + msg)
            except:
                print("N")
                self.captchaNeeded = True
                with open("successResp.html", "wb") as f:
                    f.write(resp)
        except urllib.error.HTTPError as error:
            pass
def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Optional[str]:
    """
    Takes a str and "cleans" it. Intended to be usable with short strings
    (names, titles) in any language.

    See scrub_text(), which extends this function for paragraph length and
    longer text fields.
    """
    if not raw:
        return None
    text = ftfy.fix_text(raw)

    # remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # TODO: for performance, compile these as globals?
    # replaces whitespace with single space
    text = re.sub(r"\s+", " ", text).strip()

    # TODO: shouldn't HTML be parsing these out?
    text = text.replace("<em>", "").replace("</em>", "")

    text = text.strip()
    if strip_trailing_period and text.endswith("."):
        text = text[:-1]
    if text.lower() in UNWANTED_SHORT_STRINGS:
        return None
    if not text:
        return None
    return text
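# A small usage sketch for clean_str(); the expected values assume ftfy
# passes plain ASCII through unchanged and that UNWANTED_SHORT_STRINGS does
# not contain these sample titles.
assert clean_str(None) is None
assert clean_str("") is None
assert clean_str("A  <em>Nice</em>  Title.") == "A Nice Title."
assert clean_str("A  <em>Nice</em>  Title.", strip_trailing_period=True) == "A Nice Title"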
def scrape_brands():
    brand_json = json.load(urlopen(US_BRANDS_URL))

    for brand in brand_json['brands']:
        name_html = brand['name']['desktop']
        name = BeautifulSoup(name_html).text.strip()  # resolve HTML entities
        if name.endswith('*'):
            yield dict(brand=name[:-1], is_licensed=True)
        else:
            yield name
def get_progress(self, task_id):
    with AppAssureSession(self.server, self.port, self.username,
                          self.password) as session:
        try:
            events = Events(session).taskMonitor(task_id).text
            percent = BeautifulSoup(events).td.td.text
            if not percent.endswith('%'):
                percent = "100%"
            return int(percent[:-1])
        except AppAssureError as e:
            return e[1].text
        except (ValueError, AttributeError) as e:
            return str(e)
def scrape_company():
    yield 'company', {'company': COMPANY, 'url': COMPANY_URL}

    brand_json = json.load(urlopen(US_BRANDS_JSON_URL))

    for brand_dict in brand_json['brands']:
        brand = dict(company=COMPANY)

        name_html = brand_dict['name']['desktop']
        name = BeautifulSoup(name_html).text.strip()  # resolve HTML entities
        if name.endswith('*'):
            brand['brand'] = name[:-1]
            brand['is_licensed'] = True
        else:
            brand['brand'] = name

        if brand_dict['brand_url']:
            brand['url'] = urljoin(ALL_BRANDS_URL, brand_dict['brand_url'])

        yield 'brand', brand
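# A hypothetical consumer for scrape_company() above; the CSV path and the
# column layout are illustrative, not part of the original scraper.
import csv

def dump_brands(path='brands.csv'):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['company', 'brand', 'is_licensed', 'url'])
        for kind, record in scrape_company():
            if kind == 'brand':
                writer.writerow([record.get('company'),
                                 record.get('brand'),
                                 record.get('is_licensed', False),
                                 record.get('url', '')])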
sjAnswers = [[0 for x in range(6)] for y in range(5)]
djAnswers = [[0 for x in range(6)] for y in range(5)]

row = 0
col = 0
for square in sjDivs:
    text = square.find('td', class_='clue_text')
    if text:
        sjClues[row][col] = text.text
        answerDiv = square.find('div')
        if answerDiv:
            answer = extract.search(str(answerDiv))
            pretty = BeautifulSoup(answer.group(1), 'html.parser').text
            if pretty.startswith('<i>'):
                pretty = pretty[3:]
            if pretty.endswith('</i>'):
                pretty = pretty[:-4]
            sjAnswers[row][col] = pretty
    col += 1
    if col == 6:
        col = 0
        row += 1

row = 0
col = 0
for square in djDivs:
    text = square.find('td', class_='clue_text')
    if text:
        djClues[row][col] = text.text
        answerDiv = square.find('div')
        if answerDiv:
def descargarCategoriaEspecifica22(URLLL, resultados):
    resultado = descargarResultado("/producto/" + URLLL, 360, 10)
    try:
        codigo = URLLL
    except:
        codigo = ''
    try:
        nombre = resultado.split('<h2 class="with-tabs">')[1].split('</h2>')[0].replace("\\t", '').strip()
    except:
        nombre = ''
    try:
        categoria = resultado.split('<b>Categor')[1].split('</div>')[0].split('</b>')[1].replace("\\t", '').replace("\\n", '').strip()
    except:
        categoria = ''
    try:
        costo = resultado.split('class="uc-price">')[2].split('<')[0].replace("\\t", '').strip()
    except:
        costo = ''
    try:
        fotos = 'http://www.radec.com.mx/sites/all/files/productos/' + codigo + '.jpg'
    except:
        fotos = ''
    val = 0
    nombre2 = nombre
    try:
        # each car_icon.gif marks one vehicle application block in the page
        for car in resultado.split("/sites/all/themes/radec/images/car_icon.gif"):
            marca = ''
            marca_auto = ''
            modelo = ''
            anio = ''
            notas = ''
            if (val == 0):
                # skip the header chunk before the first car icon
                val = 1
            else:
                try:
                    marca_auto = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[2].strip()
                except:
                    marca_auto = ''
                try:
                    marca = ''
                    if (' TYC ' in nombre):
                        marca = 'TYC'
                    if (' DEPO ' in nombre):
                        marca = 'DEPO'
                except:
                    marca = ''
                try:
                    modelo = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[3].strip()
                except:
                    modelo = ''
                anio = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[5].strip()
                if (anio != 'ALL YEARS'):
                    anioOrigin2 = '#' + anio
                    anioOrigin = anioOrigin2.replace('#20', '').replace('#19', '').replace('-20', '-').replace('-19', '-')
                    anioList = []
                    if ('-' in anio):
                        # expand a range like "2010-2014" into the individual years
                        anioInicio = int(anio.split('-')[0])
                        anioFin = int(anio.split('-')[1])
                        while (anioInicio <= anioFin):
                            anioList.append(str(anioInicio))
                            anioInicio = anioInicio + 1
                        anio = ' '.join(anioList) + ' '
                        if (len(anioList) < 5):
                            nombre = nombre.replace(anioOrigin, anio)
                        else:
                            nombre = nombre.replace(anioOrigin, anioOrigin2.replace('#', '').replace('-', ' a '))
                try:
                    notas = resultado.split('<b>Aplicaciones:</b>')[1].split('</div>')[0].replace("\\t", '').replace("\\n", '').replace('<br/>', ' - ')
                    notas = BeautifulSoup(notas, 'html.parser').text
                    # collapse runs of spaces left over from the HTML
                    while ('  ' in notas):
                        notas = notas.replace('  ', ' ')
                    if (notas.startswith(' - ')):
                        notas = notas.replace(" - ", "", 1)
                    if (notas.endswith(' - ')):
                        notas = rreplace(notas, " - ", "", 1)
                except:
                    notas = ''
                # expand the two-letter make abbreviations in the product name
                nombre = (nombre
                          .replace(' FD ', ' FORD ').replace(' CV ', ' CHEVROLET ')
                          .replace(' TY ', ' TOYOTA ').replace(' AD ', ' AUDI ')
                          .replace(' BK ', ' BUICK ').replace(' MC ', ' MERCEDES BENZ ')
                          .replace(' ST ', ' SEAT ').replace(' VW ', ' VOLKSWAGEN ')
                          .replace(' KI ', ' KIA ').replace(' NS ', ' NISSAN ')
                          .replace(' HD ', ' HONDA ').replace(' SN ', ' SATURN ')
                          .replace(' JP ', ' JEEP ').replace(' AC ', ' ACURA ')
                          .replace(' DG ', ' DODGE ').replace(' PT ', ' PONTIAC ')
                          .replace(' BW ', ' BMW ').replace(' CR ', ' CHRYSLER ')
                          .replace(' MT ', ' MITSUBISHI ').replace(' PG ', ' PEUGEOT ')
                          .replace(' UNIV ', ' UNIVERSAL '))
                nombre = nombre.replace(' JGO ', ' JUEGO ').replace(' CD ', ' CADILLAC ')
                resultados.append('"' + codigo + '","' + nombre + '","' + marca + '","' + marca_auto + '","'
                                  + categoria + '","' + costo + '","' + modelo + '","' + fotos + '","'
                                  + anio + '","' + notas + '"')
    except Exception as e:
        print('FALLO ---- > ' + URLLL)
        resultados.append('"' + URLLL + '"')
    return
def getRound(soup, id, gameId, round):
    div = soup.find('div', id=id)
    if div:
        categories = div.find_all('td', class_="category_name")
        # f = open("soup.txt","w")
        # f.write(soup.prettify())
        # f.close()
        if len(categories) == 6:
            categories = list(map(getText, categories))
            clueDivs = div.find_all('td', class_='clue')
        else:
            categories = soup.find_all('td', class_="category_name")
            clueDivs = soup.find_all('td', class_='clue')
            if round == 'Single':
                categories = categories[:6]
                clueDivs = clueDivs[:6]
            elif len(categories) >= 12:
                categories = categories[6:12]
                clueDivs = clueDivs[6:12]
            else:
                categories = []
                clueDivs = []
            categories = list(map(getText, categories))

        if len(categories) > 0:
            clues = [[0 for x in range(6)] for y in range(5)]
            answers = [[0 for x in range(6)] for y in range(5)]
            extract = re.compile('correct_response">(.*)</em>')
            row = 0
            col = 0
            numClues = 0
            for square in clueDivs:
                text = square.find('td', class_='clue_text')
                if text:
                    clues[row][col] = text.text
                    numClues += 1
                    answerDiv = square.find('div')
                    if answerDiv:
                        answer = extract.search(str(answerDiv))
                        pretty = BeautifulSoup(answer.group(1), 'html.parser').text
                        if pretty.startswith('<i>'):
                            pretty = pretty[3:]
                        if pretty.endswith('</i>'):
                            pretty = pretty[:-4]
                        answers[row][col] = pretty
                col += 1
                if col == 6:
                    col = 0
                    row += 1

            for col in range(6):
                sql = "INSERT INTO Categories (GameId, RoundCode, Name) VALUES (%s, %s, %s)"
                val = (gameId, round, categories[col])
                mycursor.execute(sql, val)
                mydb.commit()
                categoryId = mycursor.lastrowid

                sql = "Insert Into Clues (Categoryid, PointVal, Clue, Answer) Values (%s, %s, %s, %s)"
                val = []
                for row in range(5):
                    val.append((categoryId, row * 200 + 200, clues[row][col], answers[row][col]))
                mycursor.executemany(sql, val)
                mydb.commit()

            print('\t', round, ': ', numClues)
        else:
            print('\t', round, ': no clues')
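# getRound() assumes module-level `mydb`/`mycursor` handles plus a getText()
# helper. A minimal setup sketch using mysql.connector; the connection
# parameters and database name are placeholders, and the getText() stub is a
# guess at the original helper.
import mysql.connector

mydb = mysql.connector.connect(
    host='localhost',
    user='jeopardy',
    password='...',        # placeholder credentials
    database='jeopardy',
)
mycursor = mydb.cursor()

def getText(cell):
    # assumed helper: plain text of a category-name cell
    return cell.text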