Example No. 1
def parse_post(post):
    # Extract text and strip html tags and links
    content = strip_html_tags(post['the_post']['rawContent'])
    links = extract_urls(content)

    for url in links:
        content = content.replace(url, '')

    try:
        # Collect the CDN URL of every image attached to the post
        images = list(
            map(lambda x: x['cdnUrl'], post['the_post']['entities']['images']))
    except KeyError:
        # The post has no image entities
        images = []

    return content, links, images
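A minimal usage sketch, assuming the nested post structure implied by the lookups above (the field names come from the code, the values are illustrative):

post = {
    'the_post': {
        'rawContent': '<p>Read this: https://example.com/article</p>',
        'entities': {
            'images': [{'cdnUrl': 'https://cdn.example.com/img/1.png'}],
        },
    },
}
content, links, images = parse_post(post)
# content is the stripped text with the link removed; links and images hold the extracted URLs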
Example No. 2
def get_urls():
    start = time.perf_counter()
    pokedex = utils.get_data("pokedex.json")
    urls = []
    success = 0
    fail = 0
    alola = 0
    for i in range(1, len(pokedex) + 1):
        pokemon = pokedex[str(i)]
        print_prefix = f"{str(i).zfill(3)} {pokemon}"
        pokemon = quote(pokemon)
        # Nidoran♀ (#029) and Nidoran♂ (#032) need the gender symbol in the page title
        if i == 29:
            pokemon += "♀"
        elif i == 32:
            pokemon += "♂"
        api_url = f"https://bulbapedia.bulbagarden.net/w/api.php?action=parse&format=json&page={pokemon}_(Pokémon)"
        r = requests.get(api_url)
        if r.status_code == 200:
            lst = utils.extract_urls(r.text)
            if lst:
                # lst[2] is taken as the artwork thumbnail; dropping the trailing
                # size segment and the "/thumb/" path component yields the
                # full-size image URL
                url = lst[2]
                image_url = "https://" + url[:url.rfind("/")].replace(
                    "/thumb/", "/")
                urls.append(image_url)
                print(f"{print_prefix:<20} > {image_url}")
                success += 1

                for url in lst:
                    if pokemon.replace(" ", "_") + "-Alola" in url:
                        image_url = "https://" + url[:url.rfind("/")].replace(
                            "/thumb/", "/")
                        urls.append(image_url)
                        print_prefix += "-alola"
                        print(f"{print_prefix:<20} > {image_url}")
                        alola += 1
                        break
            else:
                print(f"{print_prefix:<20} > Could not find a URL")
                fail += 1
        else:
            print(f"{print_prefix:<20} > {r.status_code} {r.reason}")
            fail += 1
    elapsed_time = time.perf_counter() - start
    print(
        f"\nFound {success + alola}/{success + fail + alola} ({success} + {alola}) URLs in {elapsed_time:0.02f} seconds.\n"
    )
    return urls
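The loop looks each Pokémon up by its stringified National Dex number, so the pokedex.json loaded via utils.get_data presumably maps numbers to names along these lines (the shape is inferred from the lookups above, not taken from the actual data file):

# Assumed shape of pokedex.json, shown here as the dict it would load into:
pokedex = {
    "1": "Bulbasaur",
    "2": "Ivysaur",
    # ...
    "29": "Nidoran",   # the gender symbol is appended inside get_urls()
    "32": "Nidoran",
}

urls = get_urls()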
Example No. 3
    parser.add_argument("--url",
                        type=str,
                        help="The link to the repo that you want to check")
    parser.add_argument("--show-invalid-only",
                        action="store_true",
                        help="By default, only invalid urls are printed.")

    args = parser.parse_args()

    url = args.url
    if url.endswith(".git"):
        # Drop the ".git" suffix so the URL can be reused to build blob links
        url = url[:-4]
    giturl = osp.join(url, "blob/master")
    if osp.exists("temp/"):
        shutil.rmtree("temp/")
    git.Repo.clone_from(url, "temp")

    from utils import extract_urls, test_url_availability

    for url, fname, lidx in extract_urls(folder="temp/"):
        available = test_url_availability(url)
        if available and args.show_invalid_only:
            continue
        status = "valid" if available else "invalid"
        print("[%s]" % status, url)
        rel_path = "/".join(fname.split("/")[1:])
        print("\t", "It is in %s#L%d" % (osp.join(giturl, rel_path), lidx + 1))

    if osp.exists("temp/"):
        shutil.rmtree("temp/")
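A minimal sketch of how this checker might be invoked, assuming the snippet above lives in a script named check_links.py (the script name is hypothetical; the repository URL is reused from Example No. 6):

python check_links.py --url https://github.com/mit-han-lab/AMC.git --show-invalid-only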
Example No. 4
def prominent_domains(js, keyw, domains=set(), extend_search=True):
    """
    Given keywords of a site and domains returned by a google search, check
    which domains are found from the site. Split the result on primary and
    secondary targets.

    Primary target is a domain that is found from the keywords or in the
    domains guesses from the domain.  Secondary target is a domain that is not
    found from the keywords, but appears elsewhere in the site.

    Parameters
    ----------
    js : json object
        contains site data
    keyw : list
        contains site keywords
    domains : set
        set of tuples (mld, ps)
    extend_search : boolean
        whether to look for prominent domains from text and links as well

    Returns
    -------
    prominent : set
        set of strings "mld.ps" that either appear in the keywords or can be guessed from them
    """

    prominent = set()
    mld_guesses = keywords.guess_mld(js)

    url_tokens = re.split(r'\W+',
                          (js['starturl'] + ' ' + js['landurl']).lower())
    title_tokens = re.split(r'\W+', js['title'].lower())

    # logger.print("checking for prominent domains:")
    for mld, ps in domains:
        mld = mld.lower()
        ps = ps.lower()
        # segments = ngrams.segment(mld)
        if mld in keyw:
            logger.print("mld found from keywords: {}.{}".format(mld, ps),
                         nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))
        elif mld in mld_guesses:
            logger.print("mld found from mld-guessing: {}.{}".format(mld, ps),
                         nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))
        # elif extend_search and ' '.join(segments) in ' '.join(js['text'].lower().split()) and mld not in STOPMLDS:
        #     logger.print("found by segmentation from text: {}.{}".format(mld, ps), nots=True)
        #     prominent.add('.'.join([mld, ps]))
        # elif all(item in title_tokens for item in segments):
        #     logger.print("found by segmentation from title: {}.{}".format(mld, ps), nots=True)
        #     prominent.add('.'.join([mld, ps]))
        #     # prominent.add((mld, ps))
        elif mld in url_tokens:
            logger.print("mld in url: {}.{}".format(mld, ps), nots=True)
            prominent.add('.'.join([mld, ps]))
            # prominent.add((mld, ps))

    if extend_search:
        link_domains = set(
            keywords.split_mld_ps(link)
            for link in utils.extract_urls(js['source']))
        link_domains |= set(
            keywords.split_mld_ps(link) for link in js['loglinks'])
        # remove mlds that often occur: google, blogger, ... These are STOPMLDS
        link_domains = set(
            (mld, ps) for (mld, ps) in link_domains if mld not in STOPMLDS)

        for dom in domains:
            if dom in link_domains and dom not in prominent:
                logger.print("mld found from links: {}.{}".format(*dom),
                             nots=True)
                prominent.add('.'.join(dom))
                # prominent.add(dom)

    return prominent
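A minimal sketch of the inputs this function reads, inferred from the field accesses above (all values are illustrative):

js = {
    'starturl': 'http://example-shop.com/index.html',
    'landurl': 'http://example-shop.com/account',
    'title': 'Example Shop - Sign in',
    'source': '<html>... page source containing links ...</html>',
    'loglinks': ['http://cdn.example.com/app.js'],
}
keyw = ['example', 'shop', 'sign']
domains = {('exampleshop', 'com'), ('google', 'com')}

targets = prominent_domains(js, keyw, domains)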
Example No. 5
def url_worker(urlinput, urloutput):
    # Consume (comment_id, body) pairs from the input queue until the 'STOP'
    # sentinel arrives, putting the extracted URLs on the output queue
    for comment_id, body in iter(urlinput.get, 'STOP'):
        url_set = extract_urls(body)
        urloutput.put((comment_id, url_set))
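A minimal sketch of how such a worker is typically wired up with multiprocessing queues; the pool size and the comments iterable are illustrative, and each worker gets its own 'STOP' sentinel so that its loop terminates:

from multiprocessing import Process, Queue

urlinput, urloutput = Queue(), Queue()
workers = [Process(target=url_worker, args=(urlinput, urloutput)) for _ in range(4)]
for w in workers:
    w.start()

for comment_id, body in comments:    # comments: assumed iterable of (id, text) pairs
    urlinput.put((comment_id, body))

for _ in workers:
    urlinput.put('STOP')             # one sentinel per worker
for w in workers:
    w.join()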
Example No. 6
import git, os
from utils import extract_urls, test_url_availability

# os.makedirs("temp", exist_ok=True)
# git.Repo.clone_from("https://github.com/mit-han-lab/proxylessnas.git", "temp")
# git.Repo.clone_from("https://github.com/mit-han-lab/AMC.git", "temp")
giturl = "https://github.com/mit-han-lab/AMC/blob/master/"

for url, fname, lidx in extract_urls(folder="."):
    print(url)
    print("\t", test_url_availability(url))
    print("\t", "%s%s#L%d" % (giturl, fname.replace("temp/", ""), lidx + 1))