def download(update, context):
    global cur_file_counter
    if not ("https://vm.tiktok.com/" in update.message.text
            or "https://www.tiktok.com/" in update.message.text):
        return
    url_extractor = URLExtract()
    urls = url_extractor.find_urls(update.message.text)
    if len(urls) == 0:
        err = False
        if update.message.reply_to_message is None:
            err = True
        else:
            urls = url_extractor.find_urls(update.message.reply_to_message.text)
        if err or len(urls) == 0:
            update.message.reply_text(
                "Must either reply to a tiktok url or tell me a url"
            )
            return
    url = urls[0]
    filename = f"{cur_file_counter}.mp4"
    logging.debug(f"Downloading from {url}...")
    context.bot.send_chat_action(
        chat_id=update.effective_message.chat_id,
        action=ChatAction.UPLOAD_VIDEO,
    )
    err = download_to_file(url, filename)
    if err == 0:
        cur_file_counter += 1
        update.message.reply_video(open(filename, "rb"), supports_streaming=True)
    elif err == 99999:
        update.message.reply_text(
            f"Video is too long, must be shorter than {MAX_VID_LENGTH} seconds"
        )
    else:
        update.message.reply_text(
            f"Could not download, error {err}. Video has to be shorter than "
            f"{MAX_VID_LENGTH} seconds. Tell @creikey"
        )

def str_clean3(self, item):
    extractor = URLExtract()
    urls = extractor.find_urls(item)
    if urls:
        return tldextract.extract(urls[0]).registered_domain
    return ""

def str_clean1(self, item):
    extractor = URLExtract()
    if item.startswith("/url?q="):
        urls = extractor.find_urls(item)
        if urls:
            return tldextract.extract(urls[0]).registered_domain
    return ""

def get_pages_urls(sitemap):
    extractor = URLExtract()
    pages = sitemap.split('\n')
    # Leftover sample/debug input; not used in the return value.
    page = '<loc>https://www.qwerty.com/blog22.xml.gz</loc> <loc>https://www.qwerty.com/blog22.xml.gz</loc>'
    url = extractor.find_urls(page)
    # print('url: ', url)
    urls = [extractor.find_urls(k)[0] for k in pages if k.endswith('</loc>')]
    # print('urls: ', urls)
    return urls

def check_spreadsheet(schedule_sheet, spreadsheet_id, drive):
    today_weekday = datetime.date.today().weekday()
    current_hour = datetime.datetime.now().hour
    url_extractor = URLExtract()
    sheets_document = schedule_sheet.values().get(
        spreadsheetId=spreadsheet_id,
        range=RANGE_NAME,
        valueRenderOption='FORMULA').execute()
    posting_schedule = sheets_document.get('values', [])
    for index, posting_day_schedule in enumerate(posting_schedule):
        post_vk, post_tg, post_fb, publish_day, publish_time, \
            post_text_link, post_image_link, is_published = posting_day_schedule
        is_today = publish_day == WEEKDAYS[today_weekday]
        is_now = publish_time == current_hour
        # 'нет' ("no") in the sheet means the post has not been published yet.
        not_yet_published = is_published.lower().strip() == 'нет'
        if not (is_today and is_now and not_yet_published):
            continue
        text_url = url_extractor.find_urls(post_text_link)[0]
        text_file_id = urlparse(text_url).query[3:]
        image_url = url_extractor.find_urls(post_image_link)[0]
        image_file_id = urlparse(image_url).query[3:]
        with tempfile.NamedTemporaryFile() as image_tempfile, \
                tempfile.NamedTemporaryFile() as text_tempfile:
            get_post_text(text_file_id, text_tempfile.name, drive)
            get_post_image(image_file_id, image_tempfile.name, drive)
            terminal_commands = ['python3', 'vk_tg_fb_posting.py',
                                 image_tempfile.name, text_tempfile.name]
            if post_vk.lower().strip() == 'да':
                terminal_commands.append('-pv')
            elif post_fb.lower().strip() == 'да':
                terminal_commands.append('-pf')
            elif post_tg.lower().strip() == 'да':
                terminal_commands.append('-pt')
            exit_code = subprocess.call(terminal_commands)
            if exit_code:
                print('Program "vk_tg_fb_posting.py" finished with exit code', exit_code)
            schedule_sheet.values().update(
                spreadsheetId=spreadsheet_id,
                range="'Лист1'!H{}".format(index + 3),
                body={'values': [['да'], ]},
                valueInputOption='RAW').execute()
            print('Post was published successfully!')

def extract_urls(text):
    extractor = URLExtract()
    if type(text) is str:
        urls = extractor.find_urls(text)
        return urls
    elif type(text) is list:
        urls = []
        for x in text:
            url_x = extractor.find_urls(x)
            urls.extend(url_x)
        return urls
    else:
        print("Provided text type (%s) is not currently supported. "
              "Please supply either a list of string objects or a string object."
              % str(type(text)))

def extract_urls(path):
    text = textract.process(path)
    string = str(text, encoding='utf-8')
    extractor = URLExtract()
    urls = extractor.find_urls(string)
    return urls

def clean_url(doc):
    extractor = URLExtract()
    urls = list(set(extractor.find_urls(doc)))
    with_url = doc.split()
    remove_url = ' '.join(filter(lambda x: x not in urls, with_url))
    remove_url = remove_url.lower()
    return remove_url

class CodeSharingCog(Cog):
    """A meta cog to inform admins about the bot's status."""

    _maybe_try = f'Maybe try {Hastebin.server} next time?\n'
    single_link_message = 'I see a paste link in your message. ' + _maybe_try
    multiple_links_message = 'I see multiple paste links in your message. ' + _maybe_try

    def __init__(self, bot):
        self.extractor = URLExtract()

    @Cog.listener()
    async def on_message(self, message):
        if message.author.bot:
            return
        hastebin_urls = []
        urls = self.extractor.find_urls(message.content)
        for url in urls:
            if urlparse(url).netloc in Hastebin.converters.keys():
                link = await Hastebin.from_link(url)
                hastebin_urls.append(link)
        if hastebin_urls:
            if len(hastebin_urls) == 1:
                msg = self.single_link_message + hastebin_urls[0]
            else:
                msg = self.multiple_links_message + '\n'.join(hastebin_urls)
            await message.channel.send(msg)

def extractIOC(path):
    extractor = URLExtract()
    try:
        out = execute_command('src\\strings64.exe ' + path)
    except:
        out = execute_command('src\\strings64.exe ' + path)
    out = out.decode("utf-8").split('\n')
    extract_url = []
    ipv4 = []
    ipv6 = []
    emails = []
    for url in iocextract.extract_urls(str(out), refang=True, strip=True):
        n = extractor.find_urls(url)
        try:
            n = n[0]
            n = str(n).replace("\\r", "")
            extract_url.append(n)
        except:
            pass
    extract_url = list(set(extract_url))
    for ip4 in iocextract.extract_ipv4s(str(out), refang=True):
        ipv4.append(ip4)
    for ip6 in iocextract.extract_ipv6s(str(out)):
        ipv6.append(ip6)
    for email in iocextract.extract_emails(str(out), refang=True):
        emails.append(str(email).replace("\\r", ""))
    return (extract_url, ipv4, ipv6, emails)

def extract_URLs(content):
    if content is not None:
        print("\n***** Extract URLs *****\n")
        # Identify URLs in content
        extractor = URLExtract()
        urls = extractor.find_urls(content)  # returns list of urls
        # iocs = list(iocextract.extract_urls(content))  # another method for extracting urls
        print("extractor.find method")
        print(urls)
        # print("iocextract.extract_urls method")
        # print(iocs)
        info_to_evaluate = urls  # + iocs
        index = 0
        # Occasionally, the functions above return urls with trailing commas. Remove these.
        for url in info_to_evaluate:
            if url.endswith(','):
                info_to_evaluate[index] = url[:-1]
            index += 1
        print("Removed trailing commas")
        print(info_to_evaluate)
        print("Successfully extracted URLs")
        return info_to_evaluate

def getImageLinks(url):
    # Load intermediate google ads website
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    result = requests.get(url, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    scripts = soup.findAll("script")

    # Extract actual link to website
    text = scripts[len(scripts) - 1].get_text()
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    actualURL = urls[0]

    result = requests.get(actualURL, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    returnLink = list()
    for link in soup.findAll("a", {"class": "action-expand"}):
        returnLink.append(link.get('href'))
    return returnLink

def downloadImages(page):
    for i in range(1, page):
        print(i)
        driver.get(
            "view-source:https://pixabay.com/images/search/orchid%20flower/?pagi=" + str(i))
        pageSource = driver.page_source
        soup = BeautifulSoup(pageSource, "html.parser")
        urls = soup.findAll("span", {"class": "html-attribute-value"})
        extractor = URLExtract()
        a = 0
        for url in urls:
            extractedUrls = extractor.find_urls(url.text)
            for extUrl in extractedUrls:
                if ".jpg" in extUrl or ".png" in extUrl:
                    a += 1
                    imagelist.add(extUrl)
        for x in imagelist:
            print(x)
        print("----------------------------")
        print(len(imagelist))
        time.sleep(3)

def test_dns_cache_init():
    """Testing creating a new DNS caching resolver"""
    default_resolver = dns.resolver.get_default_resolver()
    assert default_resolver == dns.resolver.default_resolver
    if default_resolver:
        dns.resolver.default_resolver = None
    underscore_resolver = dns.resolver._resolver
    if underscore_resolver:
        dns.resolver._resolver = None

    urlextract = URLExtract()
    assert dns.resolver.default_resolver is None
    assert dns.resolver._resolver is None

    results = urlextract.find_urls("https://github.com", check_dns=True)
    assert len(results) == 1

    if not ExceptionCachingResolver:
        assert dns.resolver.default_resolver is None

    resolver = dns.resolver._resolver
    assert resolver is not None
    assert resolver.cache is not None
    assert resolver.cache.data is not None
    assert len(resolver.cache.data) == 1

def test_dns_cache_reuse():
    """Testing re-using an existing DNS caching resolver"""
    underscore_resolver = dns.resolver._resolver
    if underscore_resolver:
        dns.resolver._resolver = None
    default_resolver = dns.resolver.get_default_resolver()
    if ExceptionCachingResolver:
        assert default_resolver.__class__ == ExceptionCachingResolver
    assert default_resolver == dns.resolver.default_resolver

    cache = dns.resolver.LRUCache()
    default_resolver.cache = cache

    urlextract = URLExtract()
    assert dns.resolver._resolver is None

    results = urlextract.find_urls("https://github.com", check_dns=True)
    assert len(results) == 1

    assert dns.resolver._resolver is not None
    assert dns.resolver.default_resolver == default_resolver
    assert dns.resolver.default_resolver.cache == cache
    assert default_resolver.cache.data is not None
    assert len(default_resolver.cache.data) == 1

def on_pubmsg(self, c, e):
    a = e.arguments[0].split(":", 1)
    if len(a) > 1:
        nick = e.source.nick
        extractor = URLExtract()
        urls = extractor.find_urls(a[1].strip())
        db_path = "{}links.db".format(os.getenv("IRC_db_path", "./"))
        with Database(db_path) as db:
            for url in urls:
                db.execute(
                    "INSERT INTO links (datetime, nick, url) VALUES (?,?,?)",
                    (
                        f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}",
                        nick,
                        url,
                    ),
                )
                print("{} {} posted {}".format(
                    f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}",
                    nick,
                    url,
                ))

async def on_member_join(self, member):
    extractor = URLExtract()
    urls = extractor.find_urls(f'{member.name}')
    if not urls:
        print('no urls found good to go')
    else:
        print(urls)
        await member.guild.kick(member)

def extract_queries(dic):
    """
    Receives a dictionary and copies the entries whose keys are not URLs
    into the global ``dic_all_queries``.
    """
    extractor = URLExtract()
    for elem in dic.keys():
        if extractor.find_urls(elem) == []:
            dic_all_queries[elem] = dic[elem]

def word_validator(self, docs, replace=''):
    extractor = URLExtract()
    urls = extractor.find_urls(docs)
    # Replace each URL with the given replacement (empty string by default).
    for url in urls:
        docs = docs.replace(url, replace)
    # Strip ASCII punctuation, a range of fullwidth/CJK punctuation, and newlines.
    docs = re.sub(r'[!-/]|[:-@]|[\[-`]|[\{-~]|[︰-＠]|\n|[\u3000-\u3030]', '', docs)
    return docs

def parse_url(self, content):
    """
    :param content: content of a file for parsing
    :return: list of parsed urls from content
    """
    extractor = URLExtract()
    url_list = extractor.find_urls(content, only_unique=True)
    return url_list

def remove_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    for url in urls:
        text = text.replace(url, ' ')
    # Collapse the double spaces left behind and trim the ends.
    text = text.replace('  ', ' ').strip()
    # print(urls)
    return text

def getURLDomains(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    domains_list = []
    for url in urls:
        domain = tldextract.extract(url).domain
        domains_list.append(domain)
    return domains_list

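# A minimal usage sketch for getURLDomains above, assuming urlextract and
# tldextract are installed and imported as in the snippet's source module;
# the sample text is invented for illustration.
sample_text = "see https://forums.news.cnn.com/thread and http://example.org/page"
print(getURLDomains(sample_text))  # expected: ['cnn', 'example']
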
def get_band_disco(soup, current_records):
    # URLExtract instance.
    extractor = URLExtract()
    # Open a session with the database.
    session = Session()
    # In the "soup" object (its content resembles band_page.html) find <div id="band_disco">.
    disco_finder = soup.find("div", {"id": "band_disco"})
    # Convert the resulting tags to a string.
    s_disco_finder = str(disco_finder)
    # Extract every URL it contains.
    disco_url = extractor.find_urls(s_disco_finder)
    # Take the first URL.
    url = disco_url[0]
    # Request that URL.
    r = requests.get(url)
    # Handle any odd characters, just in case.
    r.encoding = 'utf-8'
    # Turn the response into a BeautifulSoup object.
    disco_soup = BeautifulSoup(r.content, 'html.parser')
    # From "disco_soup" (its content resembles disco.html) get every <tr> tag.
    disco_entries = disco_soup.find_all("tr")
    # Drop the first one; it is not needed.
    disco_entries.pop(0)
    # For each element in disco_entries:
    for item in disco_entries:
        # Instantiate a discography record and fill it in.
        discography = fact.factory("discography")
        discography.band_id = current_records
        try:
            # The discography attributes come in three <td> cells, indexed by 'x'.
            for x in range(3):
                s = item.find_all("td")[x]
                if x == 0:
                    discography.name = str(s.getText())
                if x == 1:
                    discography.release_type = str(s.getText())
                if x == 2:
                    discography.year = str(s.getText())
            # Add the row.
            session.add(discography)
            # Save the changes.
            session.commit()
            # Close the session.
            session.close()
        except:
            # If the band has no releases, just move on to the next one.
            session.close()

def download_links_in_html_as_attachments(self, p_html: str, p_extensions: List[str]):
    """
    Scans the given HTML file, finds links, downloads the files
    and saves them as attachments.
    This method supports only text attachments at this time.
    """
    # Build clean HTML
    if p_html is None or len(p_html) <= 0:
        return
    clean_html = p_html.replace("\r", "").replace("\n", "")
    html_tag_pos = clean_html.lower().find("<html")
    if html_tag_pos < 0:
        return
    clean_html = clean_html[html_tag_pos:]

    # Extract URLs
    extractor = URLExtract()
    urls = extractor.find_urls(clean_html)

    # Download as necessary
    for url in urls:
        low_url = url.lower()

        has_eligible_extension = False
        for extension in p_extensions:
            low_extension = "." + extension.lower()
            if low_extension in low_url:
                has_eligible_extension = True
                break
        if not has_eligible_extension:
            continue

        if "urldefense.com" in url:
            real_http_pos = low_url.rfind("http")
            clean_url = url[real_http_pos:].replace("__", "")
        else:
            clean_url = url
        if clean_url[-1] == "/":
            clean_url = clean_url[:-1]

        filename = os.path.basename(clean_url)
        dummy_name, extension = os.path.splitext(filename)
        extension = extension.replace(".", "")
        file_format = Attachment.guess_format_by_file_extension(extension)

        response = requests.get(clean_url, allow_redirects=True)
        if file_format == AttachmentFormat.text:
            downloaded_attachment = Attachment(
                p_name=filename,
                p_format=AttachmentFormat.text,
                p_text_content=response.text)
        else:
            downloaded_attachment = Attachment(
                p_name=filename,
                p_format=AttachmentFormat.binary,
                p_binary_content=response.content)
        self.attachments.append(downloaded_attachment)

def crawler():
    # URLExtract instance.
    extractor = URLExtract()
    # Parameters to start scraping the DataTable.
    page = 1
    display_start = 0
    current_records = 0
    # Initial URL.
    target_url = S.current_target_url(page, display_start)
    # Total records and the entries of the first page.
    total_records = 1  # get_total_records(target_url)
    json_data = S.get_json_data(target_url)
    # While we have not gone past the total number of records:
    while current_records < total_records:
        # -> For each batch of 500 elements:
        for x in range(500):
            # -> Stop when there is nothing left to fetch.
            if current_records == total_records:
                break
            # -> Count the current record.
            current_records += 1
            # -> Cast the band data from the DataTable to a string
            # -> and find the URL of its profile.
            s_json_data = str(json_data["aaData"][x][0])
            extracted_url = extractor.find_urls(s_json_data)
            # -> Request the extracted URL (the band's profile).
            r = requests.get(extracted_url[0])
            # -> If the request succeeds:
            if r.status_code == 200:
                soup = BeautifulSoup(r.content, 'html.parser')
                # -> Parse the attributes and insert them (object -> DB).
                S.get_band_attributes(soup)
                S.get_band_disco(soup, current_records)
                S.get_band_members(soup, current_records)
            # -> Wait 2 seconds per cycle.
            time.sleep(2)
        # Every 500 entries:
        # -> Move to the next DataTable page.
        page += 1
        # -> Show the next 500 bands.
        display_start += 500
        # Refresh the initial URL.
        target_url = S.current_target_url(page, display_start)
        # Refresh the initial JSON.
        json_data = S.get_json_data(target_url)

def get_urls(body) -> List[str]:
    # Set up an extractor to extract the URLs
    extractor = URLExtract()
    # Update the TLD list if it is older than 7 days
    extractor.update_when_older(7)
    # Run the extractor and remove any duplicates
    urls = extractor.find_urls(body, only_unique=True)
    return urls

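# A hedged usage sketch for get_urls above: update_when_older(7) refreshes the
# cached TLD list when it is more than a week old, and only_unique=True drops
# repeated matches. The sample body below is invented for illustration.
sample_body = "Read https://example.com/a then https://example.com/a and http://example.org"
print(get_urls(sample_body))  # expected: the two distinct URLs, each listed once
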
def word_validator(docs, replace='URL'):
    extractor = URLExtract()
    urls = extractor.find_urls(docs)
    for url in urls:
        docs = docs.replace(url, replace)
    # docs = re.sub(r'[︰-＠]|\n|[\u3000-\u3030]', "", docs)
    return docs

def retrieve_doi(url):
    dois = []
    # `response` avoids shadowing the `url` argument.
    with urllib.request.urlopen(url) as response:
        s = response.read().decode('utf-8')
    text = s.replace(' ', '').replace('=', '')
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    dois = find_doi_string_regex(urls)
    return dois

def replace_url(s):
    extractor = URLExtract()
    if extractor.has_urls(s):
        urls = extractor.find_urls(s, only_unique=True)
        for url in urls:
            s = s.replace(url, "<url>")
    return s

def initialize(self):
    self.logNotify("Initializing OpenSelery")
    # log the OpenSelery version information
    selerypath = os.path.realpath(__file__)
    self.log("OpenSelery HEAD sha [%s]" % git_utils.get_head_sha(selerypath))
    self.log("OpenSelery last tag [%s]" % git_utils.get_lastest_tag(selerypath))
    # initialize config dict with defaults from the template
    self.log("Preparing Configuration")
    self.config = OpenSeleryConfig()
    # parse args
    self.log("Parsing arguments")
    args = self.parseArgs()
    # apply args dict to config
    self.config.apply(vars(args).items())
    # apply yaml config to our configuration if possible
    self.log("Loading configuration [%s]" % self.config.config_path)
    self.loadYaml(self.config.config_path)
    # load our readme file
    extractor = URLExtract()
    fundingPath = self._getFile("README.md")
    if fundingPath is not None:
        self.log("Loading funding file [%s] for bitcoin wallet" % fundingPath)
        mdfile = open('README.md', 'r')
        mdstring = mdfile.read()
        urls = extractor.find_urls(mdstring)
        badge_string = "https://en.cryptobadges.io/donate/"
        for url in urls:
            if badge_string in url:
                self.config.bitcoin_address = url.split(badge_string, 1)[1]
                self.log("Found bitcoin address [%s]" % self.config.bitcoin_address)
    else:
        self.log(
            "Using bitcoin address from configuration file for validation check [%s]"
            % self.config.bitcoin_address)
    # load tooling url
    if self.config.include_tooling_and_runtime and self.config.tooling_path:
        with open(self.config.tooling_path) as f:
            self.config.toolrepos = yaml.safe_load(f)
        if self.config.toolrepos is not None:
            self.log("Tooling file loaded [%s]" % self.config.toolrepos)
        else:
            self.log("No tooling urls found")
    else:
        self.log("Tooling not included")
    # load our environment variables
    self.loadEnv()
    self.logNotify("Initialized")
    self.log(str(self.getConfig()))

def extract_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return urls

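# A minimal, hedged usage sketch for the generic extract_urls helper above
# (assumes `from urlextract import URLExtract`); the input string is invented
# for illustration.
print(extract_urls("Docs live at https://urlextract.readthedocs.io and github.com/lipoja/URLExtract"))
# expected: ['https://urlextract.readthedocs.io', 'github.com/lipoja/URLExtract']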