Example #1
def download(update, context):
    global cur_file_counter
    if not ("https://vm.tiktok.com/" in update.message.text or "https://www.tiktok.com/" in update.message.text):
        return
    url_extractor = URLExtract()
    urls = url_extractor.find_urls(update.message.text)
    if len(urls) == 0:
        err = False
        if update.message.reply_to_message is None:
            err = True
        else:
            urls = url_extractor.find_urls(update.message.reply_to_message.text)
        if err or len(urls) == 0:
            update.message.reply_text(
                "Must either reply to a tiktok url or tell me a url"
            )
            return
    url = urls[0]
    filename = f"{cur_file_counter}.mp4"
    logging.debug(f"Downloading from {url}...")
    context.bot.send_chat_action(chat_id=update.effective_message.chat_id, action=ChatAction.UPLOAD_VIDEO)

    err = download_to_file(url, filename)
    if err == 0:
        cur_file_counter += 1
        with open(filename, "rb") as video_file:
            update.message.reply_video(video_file, supports_streaming=True)
    elif err == 99999:
        update.message.reply_text(
            f"Video is too long, must be shorter than {MAX_VID_LENGTH} seconds"
        )
    else:
        update.message.reply_text(
            f"Could not download, error {err} . Video has to be shorter than {MAX_VID_LENGTH} seconds. Tell @creikey"
        )
Example #2
 def str_clean3(self, item):
     extractor = URLExtract()
     urls = extractor.find_urls(item)
     if urls:
         return tldextract.extract(urls[0]).registered_domain
     return ""
Example #3
 def str_clean1(self, item):
     extractor = URLExtract()
     if item.startswith("/url?q="):
         urls = extractor.find_urls(item)
         if urls:
             return tldextract.extract(urls[0]).registered_domain
     return ""
Example #4
def get_pages_urls(sitemap):
    # Take the first URL found in every '<loc>...</loc>' line of the sitemap.
    extractor = URLExtract()
    pages = sitemap.split('\n')
    urls = [extractor.find_urls(k)[0] for k in pages if k.endswith('</loc>')]
    return urls
Example #5
def check_spreadsheet(schedule_sheet, spreadsheet_id, drive):
    today_weekday = datetime.date.today().weekday()
    current_hour = datetime.datetime.now().hour

    url_extractor = URLExtract()

    sheets_document = schedule_sheet.values().get(spreadsheetId=spreadsheet_id,
                                                  range=RANGE_NAME, valueRenderOption='FORMULA').execute()

    posting_schedule = sheets_document.get('values', [])

    for index, posting_day_schedule in enumerate(posting_schedule):

        post_vk, post_tg, post_fb, publish_day, publish_time, \
        post_text_link, post_image_link, is_published = posting_day_schedule

        is_today = publish_day == WEEKDAYS[today_weekday]
        is_now = publish_time == current_hour
        # 'нет' means "no", i.e. the post has not been published yet
        is_not_published = is_published.lower().strip() == 'нет'

        if not (is_today and is_now and is_not_published):
            continue

        text_url = url_extractor.find_urls(post_text_link)[0]
        # the link query is expected to look like 'id=<file id>'; strip the 'id=' prefix
        text_file_id = urlparse(text_url).query[3:]

        image_url = url_extractor.find_urls(post_image_link)[0]
        image_file_id = urlparse(image_url).query[3:]


        with tempfile.NamedTemporaryFile() as image_tempfile, tempfile.NamedTemporaryFile() as text_tempfile:
            get_post_text(text_file_id, text_tempfile.name, drive)
            get_post_image(image_file_id, image_tempfile.name, drive)

            terminal_commands = ['python3', 'vk_tg_fb_posting.py', image_tempfile.name, text_tempfile.name]

            # 'да' means "yes": append the flag for the first network marked for posting
            if post_vk.lower().strip() == 'да':
                terminal_commands.append('-pv')
            elif post_fb.lower().strip() == 'да':
                terminal_commands.append('-pf')
            elif post_tg.lower().strip() == 'да':
                terminal_commands.append('-pt')

            exit_code = subprocess.call(terminal_commands)

            if exit_code:
                print('Program "vk_tg_fb_posting.py" finished with exit code', exit_code)

            # Mark the row as published ('да' = "yes") on sheet 'Лист1'
            schedule_sheet.values().update(
                spreadsheetId=spreadsheet_id,
                range="'Лист1'!H{}".format(index + 3),
                body={'values': [['да'], ]},
                valueInputOption='RAW').execute()
            print('Post was published successfully!')
Example #6
def extract_urls(text):
    extractor = URLExtract()
    if isinstance(text, str):
        urls = extractor.find_urls(text)
        return urls
    elif isinstance(text, list):
        urls = []
        for x in text:
            url_x = extractor.find_urls(x)
            urls.extend(url_x)
        return urls
    else:
        print("Provided text type (%s) is not currently supported. Please supply either a list of string objects or a string object." % str(type(text)))
Example #7
def extract_urls(text):
    extractor = URLExtract()
    if isinstance(text, str):
        urls = extractor.find_urls(text)
        return urls
    elif isinstance(text, list):
        urls = []
        for x in text:
            url_x = extractor.find_urls(x)
            urls.extend(url_x)
        return urls
    else:
        print(
            "Provided text type (%s) is not currently supported. Please supply either a list of string objects or a string object."
            % str(type(text)))
Example #8
def extract_urls(path):

    text = textract.process(path)
    string = str(text, encoding='utf-8')
    extractor = URLExtract()
    urls = extractor.find_urls(string)
    return urls
Example #9
def clean_url(doc):
    extractor = URLExtract()
    urls = list(set(extractor.find_urls(doc)))
    with_url = doc.split()
    remove_url = ' '.join(filter(lambda x: x not in urls, with_url))
    remove_url = remove_url.lower()
    return remove_url
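A minimal usage sketch for clean_url above (not from the original source; the sample text and the expected output in the comment are illustrative assumptions):
from urlextract import URLExtract  # required by clean_url above

print(clean_url("Read the docs at https://example.com/guide before asking."))
# -> roughly: "read the docs at before asking." (URL tokens removed, text lowercased)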
Example #10
class CodeSharingCog(Cog):
    """A meta cog to inform admins abouts bot's status."""

    _maybe_try = f'Maybe try {Hastebin.server} next time?\n'
    single_link_message = 'I see a paste link in your message. ' + _maybe_try
    multiple_links_message = 'I see multiple paste links in your message. ' + _maybe_try

    def __init__(self, bot):
        self.bot = bot
        self.extractor = URLExtract()

    @Cog.listener()
    async def on_message(self, message):
        if message.author.bot:
            return

        hastebin_urls = []

        urls = self.extractor.find_urls(message.content)
        for url in urls:
            if urlparse(url).netloc in Hastebin.converters:
                link = await Hastebin.from_link(url)
                hastebin_urls.append(link)
        if hastebin_urls:
            if len(hastebin_urls) == 1:
                msg = self.single_link_message + hastebin_urls[0]
            else:
                msg = self.multiple_links_message + '\n'.join(hastebin_urls)

            await message.channel.send(msg)
Example #11
def extractIOC(path):
    extractor = URLExtract()
    try:
        out = execute_command('src\\strings64.exe ' + path)
    except Exception:
        # retry once if the first attempt fails
        out = execute_command('src\\strings64.exe ' + path)
    out = out.decode("utf-8").split('\n')
    extract_url = []
    ipv4 = []
    ipv6 = []
    emails = []
    for url in iocextract.extract_urls(str(out), refang=True, strip=True):
        n = extractor.find_urls(url)
        try:
            n = str(n[0]).replace("\\r", "")
            extract_url.append(n)
        except IndexError:
            # find_urls returned no match for this candidate
            pass
    extract_url = list(set(extract_url))
    for ip4 in iocextract.extract_ipv4s(str(out), refang=True):
        ipv4.append(ip4)
    for ip6 in iocextract.extract_ipv6s(str(out)):
        ipv6.append(ip6)
    for email in iocextract.extract_emails(str(out), refang=True):
        emails.append(str(email).replace("\\r", ""))
    return (extract_url, ipv4, ipv6, emails)
Example #12
def extract_URLs(content):

    if content is not None:
        print("\n***** Extract URLs *****\n")
        ### Identify URLs in content ###
        extractor = URLExtract()
        urls = extractor.find_urls(content)              # returns list of urls
        # iocs = list(iocextract.extract_urls(content))  # another method for extracting urls

        print("extractor.find_urls method")
        print(urls)
        # print("iocextract.extract_urls method")
        # print(iocs)

        info_to_evaluate = urls  # + iocs

        # Occasionally, the functions above return urls with trailing commas.  Remove these.
        for index, url in enumerate(info_to_evaluate):
            if url.endswith(','):
                info_to_evaluate[index] = url[:-1]

        print("Removed trailing commas")
        print(info_to_evaluate)

        print("Successfully extracted URLs")

        return info_to_evaluate
Example #13
def getImageLinks(url):

    # Load intermediate google ads website
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    result = requests.get(url, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    scripts = soup.findAll("script")

    # Extract actual link to website
    text = scripts[-1].get_text()
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    actualURL = urls[0]

    result = requests.get(actualURL, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')

    returnLink = list()

    for link in soup.findAll("a", {"class": "action-expand"}):
        returnLink.append(link.get('href'))
    return (returnLink)
Example #14
def downloadImages(page):

    for i in range(1, page):
        print(i)
        driver.get(
            "view-source:https://pixabay.com/images/search/orchid%20flower/?pagi="
            + str(i))
        pageSource = driver.page_source
        soup = BeautifulSoup(pageSource, "html.parser")
        urls = soup.findAll("span", {"class": "html-attribute-value"})
        extractor = URLExtract()

        a = 0

        for url in urls:
            extractedUrls = extractor.find_urls(url.text)
            for extUrl in extractedUrls:
                if ".jpg" in extUrl or ".png" in extUrl:
                    a += 1
                    imagelist.add(extUrl)

        for x in imagelist:
            print(x)

        print("----------------------------")
        print(len(imagelist))
        time.sleep(3)
Example #15
def test_dns_cache_init():
    """Testing creating a new DNS caching resolver"""
    default_resolver = dns.resolver.get_default_resolver()
    assert default_resolver == dns.resolver.default_resolver
    if default_resolver:
        dns.resolver.default_resolver = None

    underscore_resolver = dns.resolver._resolver
    if underscore_resolver:
        dns.resolver._resolver = None

    urlextract = URLExtract()
    assert dns.resolver.default_resolver is None
    assert dns.resolver._resolver is None

    results = urlextract.find_urls("https://github.com", check_dns=True)
    assert len(results) == 1

    if not ExceptionCachingResolver:
        assert dns.resolver.default_resolver is None

    resolver = dns.resolver._resolver
    assert resolver is not None
    assert resolver.cache is not None
    assert resolver.cache.data is not None
    assert len(resolver.cache.data) == 1
Example #16
def test_dns_cache_reuse():
    """Testing re-using an existing DNS caching resolver"""
    underscore_resolver = dns.resolver._resolver
    if underscore_resolver:
        dns.resolver._resolver = None

    default_resolver = dns.resolver.get_default_resolver()
    if ExceptionCachingResolver:
        assert default_resolver.__class__ == ExceptionCachingResolver

    assert default_resolver == dns.resolver.default_resolver
    cache = dns.resolver.LRUCache()
    default_resolver.cache = cache

    urlextract = URLExtract()
    assert dns.resolver._resolver is None

    results = urlextract.find_urls("https://github.com", check_dns=True)
    assert len(results) == 1

    assert dns.resolver._resolver is not None
    assert dns.resolver.default_resolver == default_resolver
    assert dns.resolver.default_resolver.cache == cache

    assert default_resolver.cache.data is not None
    assert len(default_resolver.cache.data) == 1
Example #17
    def on_pubmsg(self, c, e):
        a = e.arguments[0].split(":", 1)
        if len(a) > 1:
            nick = e.source.nick
            extractor = URLExtract()
            urls = extractor.find_urls(a[1].strip())
            db_path = "{}links.db".format(os.getenv("IRC_db_path", "./"))
            with Database(db_path) as db:
                for url in urls:
                    timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
                    db.execute(
                        "INSERT INTO links (datetime, nick, url) VALUES (?,?,?)",
                        (timestamp, nick, url),
                    )

                    print("{} {} posted {}".format(timestamp, nick, url))
Example #18
 async def on_member_join(self, member):
     extractor = URLExtract()
     urls = extractor.find_urls(member.name)
     if not urls:
         print('no urls found, good to go')
     else:
         print(urls)
         await member.guild.kick(member)
Example #19
def extract_queries(dic):
    """
    Receives a dictionary and stores the entries whose keys are not URLs
    in the global dic_all_queries.
    """
    extractor = URLExtract()
    for elem in dic.keys():
        if not extractor.find_urls(elem):
            dic_all_queries[elem] = dic[elem]
Example #20
 def word_validator(self, docs, replace=''):
     extractor = URLExtract()
     urls = extractor.find_urls(docs)
     for url in urls:
         docs = docs.replace(url, replace)
     docs = re.sub(r'[!-/]|[:-@]|[\[-`]|[\{-~]|[︰-@]|\n|[\u3000-\u3030]',
                   '', docs)
     return docs
Example #21
 def parse_url(self, content):
     """
     :param content: content of a file for parsing
     :return: list of URLs parsed from the content
     """
     extractor = URLExtract()
     url_list = extractor.find_urls(content, only_unique=True)
     return url_list
Example #22
def remove_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    for url in urls:
        text = text.replace(url, ' ')
    text = text.replace('  ', ' ').strip()
    # print(urls)
    return text
Example #23
def getURLDomains(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    domains_list = []
    for url in urls:
        domain = tldextract.extract(url).domain
        domains_list.append(domain)
    return domains_list
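A minimal usage sketch for getURLDomains above (illustrative input, not from the original source); it assumes the urlextract and tldextract packages are installed:
import tldextract
from urlextract import URLExtract  # both required by getURLDomains above

print(getURLDomains("see https://blog.example.co.uk/post and https://example.com"))
# -> expected to be ['example', 'example'] (the domain part, without subdomain or suffix)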
Example #24
def get_band_disco(soup, current_records):
    # URLExtract instance.
    extractor = URLExtract()

    # Open a session with the database.
    session = Session()

    # From the "soup" object (contents similar to band_page.html) find <div id="band_disco">.
    disco_finder = soup.find("div", {"id": "band_disco"})
    # Convert the resulting tags to a string.
    s_disco_finder = str(disco_finder)
    # Extract every URL present.
    disco_url = extractor.find_urls(s_disco_finder)

    # Take the first URL and assign it to a variable.
    url = disco_url[0]
    # Make a request with that URL.
    r = requests.get(url)

    # Handle unusual characters, in case there are any.
    r.encoding = 'utf-8'

    # Turn the response into a BeautifulSoup object for later use.
    disco_soup = BeautifulSoup(r.content, 'html.parser')

    # From the "disco_soup" object (contents similar to disco.html) get all <tr> tags.
    disco_entries = disco_soup.find_all("tr")

    # Drop the first one because it is not needed.
    disco_entries.pop(0)

    # -> For each element in disco_entries:
    for item in disco_entries:
        # -> Instantiate the discography entry and insert it.
        discography = fact.factory("discography")
        discography.band_id = current_records
        # -> Try:
        try:
            # -> In a loop with x < 3:
            for x in range(3):
                # -> Find all <td> tags and pick index 'x'.
                s = item.find_all("td")[x]
                # -> The discography attributes come in 3 parts, so branch on x:
                if x == 0:
                    discography.name = str(s.getText())
                if x == 1:
                    discography.release_type = str(s.getText())
                if x == 2:
                    discography.year = str(s.getText())
                # -> Add the row.
                session.add(discography)
            # Save the changes.
            session.commit()
            # Close the session.
            session.close()
        except Exception:
            # If the band has no releases, just move on to the next one.
            session.close()
Example #25
    def download_links_in_html_as_attachments(self, p_html: str,
                                              p_extensions: List[str]):
        """ Scans the given HTML file, finds links, downloads
        the files and saves them as attachments.
        This method supports only text attachments at this time.
        """
        # Build clean HTML
        if p_html is None or len(p_html) <= 0:
            return
        clean_html = p_html.replace("\r", "").replace("\n", "")
        html_tag_pos = clean_html.lower().find("<html")
        if html_tag_pos < 0:
            return
        clean_html = clean_html[html_tag_pos:]

        # Extract URLs
        extractor = URLExtract()
        urls = extractor.find_urls(clean_html)

        # Download as necessary
        for url in urls:
            low_url = url.lower()
            has_eligible_extension = False
            for extension in p_extensions:
                low_extension = "." + extension.lower()
                if low_extension in low_url:
                    has_eligible_extension = True
                    break
            if not has_eligible_extension:
                continue

            if "urldefense.com" in url:
                real_http_pos = low_url.rfind("http")
                clean_url = url[real_http_pos:].replace("__", "")
            else:
                clean_url = url
            if clean_url[-1] == "/":
                clean_url = clean_url[:-1]

            filename = os.path.basename(clean_url)
            dummy_name, extension = os.path.splitext(filename)
            extension = extension.replace(".", "")
            file_format = Attachment.guess_format_by_file_extension(extension)

            response = requests.get(clean_url, allow_redirects=True)

            if file_format == AttachmentFormat.text:
                downloaded_attachment = Attachment(
                    p_name=filename,
                    p_format=AttachmentFormat.text,
                    p_text_content=response.text)
            else:
                downloaded_attachment = Attachment(
                    p_name=filename,
                    p_format=AttachmentFormat.binary,
                    p_binary_content=response.content)

            self.attachments.append(downloaded_attachment)
Example #26
    def crawler():
        # URLExtract instance.
        extractor = URLExtract()

        # Parameters to start scraping the DataTable.
        page = 1
        display_start = 0
        current_records = 0

        # Initial URL.
        target_url = S.current_target_url(page, display_start)

        # Total records and entries of the first page.
        total_records = 1  # get_total_records(target_url)
        json_data = S.get_json_data(target_url)

        # While we have not gone past the total number of records:
        while current_records < total_records:
            # -> For every 500 elements:
            for x in range(500):
                # -> Stop condition for when there is nothing left to fetch.
                if current_records == total_records:
                    break

                # -> Counter of records processed so far.
                current_records += 1

                # -> Cast the band data from the DataTable to a string.
                # -> Find the URL of its profile.
                s_json_data = str(json_data["aaData"][x][0])
                extracted_url = extractor.find_urls(s_json_data)

                # -> GET the extracted URL (the band's profile).
                r = requests.get(extracted_url[0])
                # -> If the request succeeds:
                if r.status_code == 200:
                    soup = BeautifulSoup(r.content, 'html.parser')

                    # -> Parse the attributes and insert the objects into the DB.
                    S.get_band_attributes(soup)
                    S.get_band_disco(soup, current_records)
                    S.get_band_members(soup, current_records)

                # -> Wait 2 seconds per iteration.
                time.sleep(2)

            # Every 500 entries:
            # -> Move to the next DataTable page.
            page += 1
            # -> Show the next 500 bands.
            display_start += 500

            # Refresh the initial URL.
            target_url = S.current_target_url(page, display_start)
            # Refresh the initial JSON.
            json_data = S.get_json_data(target_url)
Example #27
def get_urls(body) -> List[str]:
    # Set up an extractor to extract the URLs
    extractor = URLExtract()
    # Update the TLD list if it is older than x days
    extractor.update_when_older(7)
    # Run the extractor and remove any duplicates
    urls = extractor.find_urls(body, only_unique=True)

    return urls
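A minimal usage sketch for get_urls above (illustrative input, not from the original source). Note that update_when_older(7) may refresh the cached TLD list from the network, and only_unique=True collapses repeated links:
from typing import List
from urlextract import URLExtract  # both required by get_urls above

body = "See https://example.com/page and, again, https://example.com/page"
print(get_urls(body))  # the duplicated link should be returned only once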
Example #28
def word_validator(docs, replace='URL'):
    extractor = URLExtract()
    urls = extractor.find_urls(docs)
    for url in urls:
        docs = docs.replace(url, replace)

    # docs = re.sub(r'[︰-@]|\n|[\u3000-\u3030]', "", docs)

    return docs
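A minimal usage sketch for this standalone word_validator (illustrative input, not from the original source):
from urlextract import URLExtract  # required by word_validator above

print(word_validator("details at https://example.net/info", replace='URL'))
# -> "details at URL"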
Example #29
def retrieve_doi(url):
    dois = []
    # do not shadow the url parameter with the opened response object
    with urllib.request.urlopen(url) as response:
        s = response.read().decode('utf-8')
        text = s.replace(' ', '').replace('=', '')
        extractor = URLExtract()
        urls = extractor.find_urls(text)
        dois = find_doi_string_regex(urls)
    return dois
Example #30
def replace_url(s):
    extractor = URLExtract()

    if extractor.has_urls(s):
        urls = extractor.find_urls(s, only_unique=True)
        for url in urls:
            s = s.replace(url, "<url>")

    return s
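A minimal usage sketch for replace_url above (illustrative input); has_urls is just a quick pre-check before the unique find-and-replace pass:
from urlextract import URLExtract  # required by replace_url above

print(replace_url("docs live at https://example.org/docs now"))
# -> "docs live at <url> now"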
Example #31
    def initialize(self):
        self.logNotify("Initializing OpenSelery")

        # resolve the path of this source file (used to locate the git repo)
        selerypath = os.path.realpath(__file__)
        # log the OpenSelery version information
        self.log("OpenSelery HEAD sha [%s]" %
                 git_utils.get_head_sha(selerypath))
        self.log("OpenSelery last tag [%s]" %
                 git_utils.get_lastest_tag(selerypath))

        # initialize config dict with default from template
        self.log("Preparing Configuration")
        self.config = OpenSeleryConfig()
        # parse args
        self.log("Parsing arguments")
        args = self.parseArgs()
        # apply args dict to config
        self.config.apply(vars(args).items())
        # apply yaml config to our configuration if possible
        self.log("Loading configuration [%s]" % self.config.config_path)
        self.loadYaml(self.config.config_path)
        # load our readme file
        extractor = URLExtract()
        fundingPath = self._getFile("README.md")
        if fundingPath is not None:
            self.log("Loading funding file [%s] for bitcoin wallet" %
                     fundingPath)
            with open(fundingPath, 'r') as mdfile:
                mdstring = mdfile.read()
            urls = extractor.find_urls(mdstring)
            badge_string = "https://en.cryptobadges.io/donate/"
            for url in urls:
                if badge_string in url:
                    self.config.bitcoin_address = url.split(badge_string, 1)[1]
                    self.log("Found bitcoin address [%s]" %
                             self.config.bitcoin_address)
        else:
            self.log(
                "Using bitcoin address from configuration file for validation check [%s]"
                % self.config.bitcoin_address)
        # load tooling url

        if self.config.include_tooling_and_runtime and self.config.tooling_path:
            with open(self.config.tooling_path) as f:
                self.config.toolrepos = yaml.safe_load(f)
            if self.config.toolrepos is not None:
                self.log("Tooling file loaded [%s]" % self.config.toolrepos)
            else:
                self.log("No tooling urls found")
        else:
            self.log("Tooling not included")

        # load our environment variables
        self.loadEnv()
        self.logNotify("Initialized")
        self.log(str(self.getConfig()))
Example #32
def extract_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return urls
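A minimal usage sketch of the basic pattern shared by most examples above (illustrative input; the exact output depends on the installed TLD list):
from urlextract import URLExtract  # required by extract_urls above

print(extract_urls("Sources: https://example.com and docs.python.org"))
# -> something like ['https://example.com', 'docs.python.org']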