Example No. 1
def summarized():
    if request.method == 'POST':
        user_input = request.form['user_input']
        summary_length = int(request.form['summary_length'])

        if checkers.is_url(user_input):
            if not evaluate.check_url_valid(user_input):
                abort(404, user_input)

        if request.form.get('checkmode'):
            check_mode = request.form['checkmode']
            ref_summary = request.form['ref_summary']

            if checkers.is_url(ref_summary):
                if not evaluate.check_url_valid(ref_summary):
                    abort(404, ref_summary)
        else:
            check_mode = False
            ref_summary = ""

        #BERT_summary
        #bert_summary = summarizers.bert_summary(user_input, ref_summary, check_mode, num_sentences_out = summary_length)

        #LSA_Summary
        lsa_summary = summarizers.lsa_summary(user_input, ref_summary, check_mode, num_sentences_out = summary_length)
        
        #luhn_summary
        luhn_summary = summarizers.luhn_summary(user_input, ref_summary, check_mode, num_sentences_out = summary_length)

        #LEX_Summary
        lex_summary = summarizers.lex_summary(user_input, ref_summary, check_mode, num_sentences_out = summary_length)

        #RESULTS
        sum_result = {
            "user_input": user_input,
            "summary_length": summary_length,
            "check_mode": check_mode,
            "ref_summary": ref_summary,
            "summaries": {
                #"BERT SUM": bert_summary,
                "Latent Semantic Analysis": lsa_summary,
                "Luhn": luhn_summary,
                "Lex Rank": lex_summary
            }
        }

        if request.form.get('checkmode'):
            #get best summary
            best_summary = evaluate.get_best_summary(sum_result)
            sum_result["best_summary"] = best_summary

            #Arrange summaries
            sum_result["summaries"] = evaluate.order_summary(best_summary, sum_result)

        

        return render_template("summary_results.html", results = sum_result)
    else:
        return render_template("summary.html")
Example No. 2
def validate(row: pd.Series):
    """
    Light, best-effort validation checks on the input data.
    """
    official_page = row["official_page"]
    if official_page and not checkers.is_url(official_page):
        raise ValidationWarning("The official URL (official_page) is invalid")
    detail_page = row["detail_page"]
    if detail_page and not checkers.is_url(detail_page):
        raise ValidationWarning("The detail page (detail_page) URL is invalid")

    # Format validation for phone number and zip code (not strict)
    remove_char_regex = r"[ -‐-‑ー−‒–—―ー ]"  # (rough set of separator characters)
    tel = re.sub(remove_char_regex, "", row["tel"])
    if tel and not re.match(r"^0\d{9,10}$", tel):
        raise ValidationWarning("The phone number (tel) format is invalid")  # 9-10 half-width digits starting with 0
    zip_code = re.sub(remove_char_regex, "", row["zip_code"])
    if zip_code and not re.match(r"\d{7}$", zip_code):
        raise ValidationWarning("The zip code (zip_code) format is invalid")  # 7 half-width digits

    # Fields that should not contain HTML tags
    for target in [
            "shop_name",
            "address",
            "official_page",
            "detail_page",
            "opening_hours",
            "closing_day",
            "area_name",
    ]:
        text = row.get(target)
        if not text:
            continue
        if len(text) != len(w3lib.html.remove_tags(text)):
            raise ValidationWarning(f"{target} contains HTML tags")

    # Sanity check of the geocoding result against the zip code
    try:
        zip_code = row["zip_code"]
        if not zip_code:
            return
        pref = cached_posuto_pref(zip_code)
    except KeyError:
        # MEMO: the data may contain (special) zip codes that do not exist in posuto,
        # i.e. "large-volume business office" individual codes; nothing can be done
        # about those, so treat them as passing validation.
        logger.info(f"Unknown zip code (possibly a large-volume business office code?): zip code={zip_code}")
        return
    except Exception as e:
        # MEMO: posuto raised an internal error, e.g. for some other unusual zip code
        logger.warning(e, stack_info=True)
        logger.warning(f"unknown posuto error, zip code={zip_code}")
        raise ValidationWarning("This zip code causes an error inside posuto (internal processing error)")

    norm_addr = row.get("normalized_address")
    if norm_addr and not norm_addr.startswith(pref):
        raise ValidationWarning(
            f"The prefecture derived from the zip code is {pref}, "
            f"but the geocoded address is {norm_addr}")
Example No. 3
def extract_data(html_page):
    temporary_variable = ''
    parsed_html_page = parse_html_content(html_page.content)
    links = parsed_html_page.find_all('a')

    for link in links:
        if link.get('href') == '/locations/':
            temporary_variable = link.get('href')
            break

    if temporary_variable != '' and (not checkers.is_url(temporary_variable)):
        CustomConstants.URL_TO_BE_VISITED.add(
            NetworkUtil.get_absolute_url(temporary_variable))
    else:
        return CustomConstants.SOMETHING_WENT_WRONG_WHILE_FETCHING_LOCATIONS

    html_page = NetworkUtil.read_from_network(
        CustomConstants.URL_TO_BE_VISITED.pop())
    parsed_html_page = parse_html_content(html_page.content)
    location_cards = parsed_html_page.find_all(class_='location card')

    if len(location_cards) > 0:
        clear_set_data()

    for location_card in location_cards:
        link = location_card.get('href')
        if checkers.is_url(link):
            CustomConstants.URL_TO_BE_VISITED.add(link)
        else:
            link = NetworkUtil.get_absolute_url(link)
            CustomConstants.URL_TO_BE_VISITED.add(link)

    room_links = set()

    for location in CustomConstants.URL_TO_BE_VISITED:
        html = NetworkUtil.read_from_network(location)
        parsed_html = parse_html_content(html.content)
        room_links.update(extract_rooms_feed(parsed_html))
        time.sleep(3.0)

    clear_set_data()

    room_detail_list = list()

    for room_link in room_links:
        html_page = NetworkUtil.read_from_network(room_link)
        parsed_html_page = parse_html_content(html_page.content)
        room_detail = extract_room_detail(parsed_html_page)
        room_detail_list.append(room_detail)
        time.sleep(3.0)
    return room_detail_list
Example No. 4
def valid_url(url: str) -> bool:
    """Check that the URL is well formatted."""
    parsed_url = urlparse(url)
    if not (checkers.is_url(parsed_url.geturl())
            or checkers.is_ip_address(parsed_url.geturl())):
        # prepend http if missing
        parsed_url = parsed_url._replace(scheme="http")
        # move the path into netloc
        parsed_url = parsed_url._replace(netloc=parsed_url.path)
        parsed_url = parsed_url._replace(path="")
        # check again with fixed url
        if not (checkers.is_url(parsed_url.geturl())
                or checkers.is_ip_address(parsed_url.geturl())):
            return False
    return True
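A quick usage sketch for valid_url, assuming the function above is importable from its module; the sample inputs are purely illustrative.

# Illustrative calls; valid_url retries schemeless inputs with http:// prepended.
print(valid_url("https://example.com/path"))   # True: parses as a URL directly
print(valid_url("example.com"))                # True: accepted once http:// is prepended
print(valid_url("not a url at all"))           # False: still invalid after the retry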
Example No. 5
def crawler(nthreads=None, url=None, output=None, all_links=None):
    """
    Web crawler starts from URL to all found links under the same netloc.
    """
    # Check if the URL is valid
    if not checkers.is_url(url):
        print(f'The url you have entered is not valid. URL: {str(url)}')
        exit(1)

    num_threads = nthreads

    crawler = Crawler(url=url,
                      num_threads=num_threads,
                      output=output,
                      all_links=all_links)

    start = time()
    crawler.start()
    end = time()

    crawler.print_results()

    print(f'\n\nCrawling took {int(end-start)} seconds.')

    return
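A minimal invocation sketch, assuming this crawler() entry point and the Crawler class live in the same module; the argument values shown are illustrative.

# Illustrative call only; the arguments mirror the parameters of crawler() above.
if __name__ == '__main__':
    crawler(nthreads=4,
            url='https://example.com',
            output='results.txt',
            all_links=False)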
Example No. 6
    async def run_image(self, context, opts):
        """
        Update the image link of a project
        @param context:
        @param opts:
        @return:
        """
        user = User(context.message.author.id, context.guild.id, context)
        shortname = opts[0].lower() if opts else None
        img = opts[1] if len(opts) > 1 else None

        # Make sure the project exists.
        project = user.get_project(shortname)
        if not project:
            return await context.send(
                user.get_mention() + ', ' +
                lib.get_string('project:err:noexists',
                               user.get_guild()).format(shortname))

        # Check it's a valid image link (if one was supplied).
        if img is not None and not checkers.is_url(img):
            return await context.send(
                user.get_mention() + ', ' + lib.get_string(
                    'project:err:link', user.get_guild()).format(img))

        project.set_image(img)
        return await context.send(
            user.get_mention() + ', ' +
            lib.get_string('project:image', user.get_guild()))
Example No. 7
def load(source) -> etree._Element:  # pylint: disable=protected-access
    '''
    Load an XML document
    args:
        source: XML source. Either path, url, string, or loaded LXML Element
    returns:
        Loaded XML object tree, or None on invalid source
    '''
    if not isinstance(source, (str, bytes)) or len(source) < 1:
        # pylint: disable=protected-access
        return source if isinstance(source, etree._ElementTree) else None

    source = source.strip()
    if source[0] == ord('<'):  # Handle source as bytes
        source = io.BytesIO(source)
    elif source[0] == '<':  # Handle source as string
        source = io.StringIO(source)
    elif checkers.is_file(source):  # Handle source as local file
        pass  # etree.parse handles local file paths natively
    elif checkers.is_url(source):  # Handle source as URL
        response = requests.get(source, timeout=10)
        if not response:
            app.logger.warning(
                f"Failed to retrieve XML URL (or timed out): {source}")
            return None
        source = io.BytesIO(response.content)
    else:
        app.logger.warning(
            f"XML source is not valid file, URL, or XML string. {source[:40]}"
            + (len(source) > 40) * '...')
        return None

    return etree.parse(source)
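A short usage sketch for load(), assuming it is importable together with its lxml, requests, and app.logger dependencies; the file path and URL are illustrative only.

tree = load("<root><item>hello</item></root>")   # parsed from an XML string
tree = load(b"<root/>")                          # parsed from XML bytes
tree = load("data/feed.xml")                     # parsed from a local file, if it exists
tree = load("https://example.com/feed.xml")      # fetched over HTTP, then parsed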
Example No. 8
def input_title(event, context):
    url = event['url']

    # dynamo table info
    table = os.environ.get('DYNAMO_TABLE')

    # validate url
    if checkers.is_url(url):

        key = str(hash(url))

        # dynamo operations
        dynamo = boto3.resource('dynamodb', region_name=region)
        table = dynamo.Table(table)
        table.put_item(Item={'titleid': key, 'url': url, 'status': 'PENDING'})

        return {
            'statusCode': 200,
            'body': json.dumps({"id": key}),
            'headers': {
                'Content-Type': 'application/json',
            }
        }
    else:
        return {
            'statusCode': 400,
            'body': {
                "error": "invalid url"
            },
            'headers': {
                'Content-Type': 'application/json',
            }
        }
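A sketch of the expected event shape, assuming input_title above is importable; only the invalid-URL branch runs without AWS access.

# The invalid-URL branch needs no AWS access and returns a 400 response.
print(input_title({'url': 'not a url'}, None))
# The valid-URL branch writes to DynamoDB, so it also needs credentials,
# the DYNAMO_TABLE environment variable, and the module-level `region`:
# print(input_title({'url': 'https://example.com/article'}, None))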
Example No. 9
def get_profile_urls(driver, n_pages=5):
    """
    Return a list without repetitions of alphabetically sorted URLs
    taken from the results of a given query on Google search.

    :param driver: selenium chrome driver object
    :param n_pages: int number of google pages to loop over
    :return: list of linkedin-profile URLs
    """
    linkedin_urls = []
    for i in range(n_pages):
        urls = driver.find_elements_by_class_name("yuRUbf [href]")
        #links = [url.get_attribute('href') for url in urls]

        linkedin_urls += [
            url.get_attribute('href') for url in urls
            if checkers.is_url(url.get_attribute('href')) and re.search(
                r"^https://[a-z]+\.linkedin\..*$", url.get_attribute('href'))
        ]
        print(linkedin_urls)
        sleep(0.5)
        if i > 1:
            try:
                next_button_url = driver.find_element_by_css_selector(
                    '#pnnext').get_attribute('href')
                driver.get(next_button_url)
            except NoSuchElementException:
                break
    linkedin_urls_no_rep = sorted(dict.fromkeys(linkedin_urls))
    return linkedin_urls_no_rep
Example No. 10
async def parse_search(ctx, search: str, loop: asyncio.BaseEventLoop = None):

    loop = loop or asyncio.get_event_loop()
    source_type = "GDrive"
    if checkers.is_url(search):
        return search

    gdrive_folder_id = config['gdrive_id']
    if not gdrive_folder_id:
        return search

    source_init = Source(ctx, source_type=source_type, loop=loop)
    try:
        sources = await source_init.get_playlist(gdrive_folder_id,
                                                 include_name=True)
    except SourceError as e:
        await ctx.send(
            'An error occurred while processing this request: {}'.format(
                str(e)))
        return search

    for each_source in sources:
        if search.lower() in each_source['name'].lower():
            search = f"https://drive.google.com/file/d/{each_source['id']}/view"
            break

    return search
Example No. 11
def create(event, context):
    data = json.loads(event['body'])

    if 'url' not in data:
        logging.error('URL parameter not provided')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'Insufficient data'})
        }

    url = data['url']

    if not url:
        logging.error('URL value missing')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'URL missing'})
        }

    if not checkers.is_url(url):
        logging.error('URL is invalid')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'URL invalid'})
        }

    if 'id' in data:
        id = data['id']
    else:
        id = generate(size=6)

    url_added = UrlModel(id=id, url=url, created=datetime.now())
    url_added.save()

    return {'statusCode': 200, 'body': json.dumps({'id': id, 'url': url})}
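A sketch of the expected event shape for create(), assuming the handler and its UrlModel/generate dependencies are importable; only the validation branches run without a datastore.

import json

# Missing or invalid URLs return 422 without touching UrlModel.
print(create({'body': json.dumps({})}, None))
print(create({'body': json.dumps({'url': 'not a url'})}, None))
# A valid URL would be persisted via UrlModel, so it needs the real table:
# print(create({'body': json.dumps({'url': 'https://example.com'})}, None))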
Example No. 12
def generate_shop_url(domain_or_url):
    url = None
    if checkers.is_domain(domain_or_url):
        url = f'https://{domain_or_url}'
    elif checkers.is_url(domain_or_url):
        url = domain_or_url
    return url
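A brief illustration of generate_shop_url(), assuming it and validator_collection's checkers are importable; the inputs are made up.

print(generate_shop_url('myshop.example.com'))         # 'https://myshop.example.com'
print(generate_shop_url('https://myshop.example.com')) # returned unchanged
print(generate_shop_url('not a domain or url'))        # None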
Example No. 13
    def test_build_url(self):
        """Test URL builder."""
        # without URL parameters
        item_url = self.plg_utils.build_url(
            base_url="https://guts.github.io/mkdocs-rss-plugin/",
            path="changelog")
        self.assertTrue(checkers.is_url(item_url))

        # with URL parameters
        item_url = self.plg_utils.build_url(
            base_url="https://guts.github.io/mkdocs-rss-plugin/",
            path="changelog",
            args_dict={"utm_source": "test_rss"},
        )
        print(item_url)
        self.assertTrue(checkers.is_url(item_url))
Example No. 14
def register(url):
    """Registers a URL. Validates input URL, and returns with a non-zero exit
    code if the URL is invalid. If the URL is valid, adds the URL to a internal,
    persistent registry.
    """
    # Check if URL is valid using regex
    if not checkers.is_url(url):
        print("Error: invalid URL.")
        return sys.exit(os.EX_DATAERR)

    home = os.path.expanduser("~")
    file_path = os.path.join(home, FILE_PATH)

    if not os.path.exists(FILE_DIR):
        try:
            os.makedirs(os.path.dirname(FILE_DIR))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    open(file_path, "a").close()

    with open(file_path, "r+") as file:
        for line in file:
            if url in line:
                return sys.exit(os.EX_OK)
        # Write a space-delimited row of the form
        # "url content_num_bytes load_time_seconds refresh_time".
        file.write(" ".join([url, "0", "0", "0"]) + "\n")
    return sys.exit(os.EX_OK)
Example No. 15
def validate_url(url):
    """
    Determine if a given URL is valid.  Return True if so, False if not
    """
    if not url:
        raise ValueError("No url provided")

    return checkers.is_url(url)
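A quick usage sketch, assuming validate_url above is importable; the inputs are illustrative.

print(validate_url("https://example.com"))   # True
print(validate_url("definitely not a url"))  # False
try:
    validate_url("")
except ValueError as exc:
    print(exc)  # "No url provided"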
Example No. 16
async def handle_url(message: Message, state: FSMContext):
    if checkers.is_url(message.text):
        await States.url.set()
        await state.update_data(url=message.text)
        markup = get_keyboard()
        await message.answer("Options:", reply_markup=markup)
    else:
        await message.answer("Invalid link.")
Example No. 17
 def source_create_url_input(self, message) -> str:
     '''Validates URL input for source creation.'''
     input_text = input(WHITE + f"\n{message}: ")
     if not checkers.is_url(input_text):
         print(
             RED +
             "Error: Entered data is invalid. Please check and try again.")
         return self.source_create_url_input(message)
     return input_text  # returns the entered string if all validation rules are met
Example No. 18
def test_extract_single():
    extracted = extract_single("http://google.com")
    # Image path probably pretty stable; ideally we extract from a page
    # that never changes like a self-hosted testing page set up for this
    # purpose.
    assert re.findall(r"http://google\.com/images.*\.png", extracted["images"][0])

    for link in extracted["links"]:
        assert checkers.is_url(link)
Example No. 19
def url(u):
    """Download a video of a given URL."""

    if checkers.is_url(u):
        if download(u):
            click.echo('Success!')
        else:
            click.echo('Download error. No fap :(')
    else:
        click.echo('URL error.')
Example No. 20
File: cli.py Project: timaa2k/m
def upload(ctx: Dict[str, Any], tags: str, filepath: str) -> None:
    api = ctx['api']
    namespace = ctx['namespace']
    t = remove_if_in_target(namespace, tags.split('/'))
    if checkers.is_url(filepath):
        link = f'<head><meta http-equiv="Refresh" content="0; URL={filepath}"></head>'
        print(api.put_latest(tags=namespace + t, content=str.encode(link)))
        return
    with Path(filepath).open(mode='rb') as f:
        print(api.put_latest(tags=namespace + t, content=f))
Example No. 21
 async def e_img(self, ctx, event_id, url):
     """ check if event exist, valid format, update image url field """
     if self.get_event(event_id) is None:  # no event with this id
         await ctx.send(error_msgs['no_event'].format(event_id))
     elif checkers.is_url(url) is False:  # format url invalid
         await ctx.send(error_msgs["url_format"])
     else:  # update image_url field
         await ctx.send(
             self.update_event(event_id, 'image_url', url,
                               ctx.message.created_at))
Example No. 22
def index_post():
    text = request.form['text']
    if checkers.is_url(text):
        key = Controller().db_insert(text)
        if key:
            return 'Your new url is http://127.0.0.1:5000/' + key
        else:
            return 'Error'
    else:
        return 'Error: Must be a valid url'
Example No. 23
def f_valid_url(target):
    import requests
    from validator_collection import checkers
    if checkers.is_url(target):
        response = requests.get(target)
        if response.status_code == 200:
            return 1
        else:
            return 0
    else:
        return 0
Example No. 24
    async def _network_reach(self):
        """reach website et return HTTP code"""
        response = ""
        if self.args:
            if len(self.args) == 1:
                if checkers.is_url(self.args[0]):
                    response = network.network_reach(self.args[0], "")
                url = "https://" + self.args[0]
                if checkers.is_url(url):
                    response = network.network_reach(self.args[0], "")
            if len(self.args) == 2:
                if checkers.is_url(self.args[0]) and self.args[1] == "details":
                    response = network.network_reach(self.args[0], "details")
                url = "https://" + self.args[0]
                if checkers.is_url(url):
                    response = network.network_reach(self.args[0], "details")

        else:
            response = "Im not soothsayer...Give me an url !"
        await send_text_to_room(self.client, self.room.room_id, response)
Example No. 25
async def test(ctx, *, url):
    if (checkers.is_url(url)):
        await ctx.send("Just a sec...")
        await testportal.getTest(url)
        screenshots = os.listdir('screenshots')
        length = len(screenshots)
        for x in range(length):
            await ctx.send(file=discord.File(
                os.path.join('screenshots', f'screenshot{x + 1}.png')))
        deleteScreenshots()
    else:
        await ctx.send("The URL is not valid")
Example No. 26
def _parse_url(param, default):
    if param is not None and not isinstance(param, (bool, str)):
        raise ValueError(f"Invalid parameter input {param}.")

    if param is None or param is False:
        return False
    elif param is True:
        return default
    elif checkers.is_url(str(param)):
        return str(param)
    else:
        raise URLError(f"Invalid url: {param}")
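A brief illustration of _parse_url()'s mapping, assuming it and the URLError it raises are importable; the default value here is made up.

default = "https://example.com/api"                 # illustrative default
print(_parse_url(None, default))                    # False (feature disabled)
print(_parse_url(False, default))                   # False
print(_parse_url(True, default))                    # the default URL
print(_parse_url("https://example.org", default))   # the given URL, as a string
# _parse_url("not a url", default) raises URLError
# _parse_url(42, default) raises ValueError (not None, bool, or str)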
Example No. 27
def resumeur(request):
    input_texte = request.POST['input_texte']
    #try:
    if checkers.is_url(input_texte):
        url = input_texte
        article_dico = article_extraction(url)
        #article_dico["summary_2"] = ktrain_texte_resumeur(article_dico["texte"], lang='fr')

        #else:
        #pass
    #except: pass
    return render(request, "summarizer/result.html", article_dico)
Example No. 28
def read_from_network(url):
    try:
        if checkers.is_url(url):
            response = requests.get(url,
                                    headers=CustomConstants.REQUEST_HEADER)
            CustomConstants.URL_VISITED.add(url)
            return response
        else:
            CustomConstants.URL_TO_BE_VISITED.add(url)
            return CustomConstants.URL_IS_NOT_VALID
    except Exception:
        CustomConstants.URL_TO_BE_VISITED.add(url)
        return CustomConstants.ERROR_OCCURED_WHILE_SENDING_REQUEST
Example No. 29
 async def convert(self, ctx, argument):
     with suppress(Exception):
         mem = await member_converter.convert(ctx, argument)
         return str(mem.avatar_url_as(static_format='png', size=1024))
     with suppress(Exception):
         emoji = await emoji_converter.convert(ctx, str(argument))
         return str(emoji.url)
     if ctx.message.attachments:
         with suppress(Exception):
             return str(ctx.message.attachments[0].url)
     elif checkers.is_url(str(argument)):
         return str(argument)
     else:
         return 'error'
Example No. 30
def urls_to_collect(urls_file: str):
    """Collect urls given urls in a file."""
    urls = []
    with open(urls_file) as f:
        for url in f:
            url = url.strip()
            if url.startswith("#"):  # comment lines should be ignored
                continue
            if len(url) == 0:  # ignore empty lines
                continue
            if not checkers.is_url(url):
                logging.warning("invalid url: %s", url)
                continue
            urls.append(url)
    return urls
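A small end-to-end sketch, assuming urls_to_collect above is importable; the file contents are illustrative.

import tempfile

# Build a throwaway urls file: a comment line, a valid URL, a blank line, and junk.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("# seed urls\nhttps://example.com/page\n\nnot-a-url\n")

print(urls_to_collect(tmp.name))  # ['https://example.com/page']; 'not-a-url' is logged and skipped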