Example #1
def studio():
    if request.method == "GET":
        common.connect_db()
        return render_template("studio.html",
                               schemas=list(schema.Challenge.objects().order_by("-modified")))  # yapf: disable
    else:
        scraper = Scraper()
        sid = scraper.get_id(request.form["studio"])

        s = None
        if request.form["schema"] != "__none__":
            s = request.form["schema"]

        if request.form["studio"] == "__all__":
            scrape.rescrape_all.delay(cache_directory=CACHE_DIRECTORY)
            return "Started"
        elif sid is not None:
            scrape.add_studio.delay(sid,
                                    schema=s,
                                    show=("show" in request.form),
                                    cache_directory=CACHE_DIRECTORY)
            return redirect("/studio/{0}".format(sid))
        else:
            return render_template(
                "studio.html",
                message="Please enter a valid studio ID or URL.")
Example #2
def test_studio():
    scrape = Scraper()
    projects = scrape.get_projects_in_studio(26211962)

    p_to_s = {project: 26211962 for project in projects}

    return projects, p_to_s
Example #3
def project_download():
    if request.form["sid"] is None or request.form["pid"] is None:
        return "False"
    sid = request.form["sid"]
    pid = request.form["pid"]

    scraper = Scraper()
    try:
        pid = int(pid)
    except:
        return "False"

    if pid in scraper.get_projects_in_studio(sid):
        return str(scrape.add_project(pid, sid, CACHE_DIRECTORY))
    else:
        return "False"
Example #4
def generate_summary_page(credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Performs all the aggregation required to generate the summary page.
    
    Args:
        credentials_file (str): path to the database credentials file.

    Returns:
        True, once the summary data has been aggregated and saved.
    """

    logging.info("starting to aggregate summary statistics")

    # Stitch the project images together
    img = get_stitched(get_image_urls(), 16, w=96, h=72)
    img.save(
        "{}/data/projects.jpg".format(settings.CACHE_DIRECTORY),
        dpi=(72, 72),  # yapf: disable
        quality=75)

    logging.info("project image stitch saved, starting on data gathering")

    # Get the data
    now = datetime.now()
    studios = get_ordered_studios()
    studio_ids = [s["studio_id"] for s in studios]
    engagement = get_total_engagement(studio_ids)
    data = {
        "project_counts": [s["stats"]["total"]["number_projects"] for s in studios],
        "nations": get_author_origins(get_unique_authors(studio_ids)),
        "totals": {
            "block_count": sum(s["stats"]["total"]["block_count"] for s in studios),
            "categories": get_total_categories(studios),
            "comments": sum(s["stats"]["total"]["comments_left"] for s in studios),
            "description": sum(s["stats"]["total"]["description_words"] for s in studios),
            "hearts_stars": engagement["loves"] + engagement["favorites"],
            "projects": sum(s["stats"]["total"]["number_projects"] for s in studios),
            "unique_authors": len(get_unique_authors(studio_ids))
        },
        "updated": now.strftime("%A, %B %d, %Y")
    }

    with open("{}/lib/data/summary.json".format(settings.PROJECT_DIRECTORY)) as f:  # yapf: disable
        static = json.load(f)
        data["static"] = static["statistics"]

    if Scraper().make_dir("{}/data".format(settings.CACHE_DIRECTORY)):
        with open("{}/data/summary.json".format(settings.CACHE_DIRECTORY), "w") as f:  # yapf: disable
            json.dump(data, f)

    logging.info("completed aggregating summary statistics")
    return True
Example #5
def get_author_origins(authors):
    """Gets the origin locations of project authors.
    
    Args:
        authors (array-like): a set of authors for whom origin locations are to be counted.

    Returns:
        A dictionary mapping each country to the number of authors from it.
    """

    nations = dict()
    scraper = Scraper()
    for author in authors:
        user = scraper.get_user_info(author)
        country = user["profile"]["country"]
        nations[country] = nations.get(country, 0) + 1

    return nations
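An equivalent sketch using collections.Counter, assuming the same Scraper API shown above:

from collections import Counter

def get_author_origins_counter(authors):
    # Count origin countries in one pass; behaviour matches the loop above.
    scraper = Scraper()
    return dict(Counter(scraper.get_user_info(author)["profile"]["country"]
                        for author in authors))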
Example #6
def add_comments(project_id,
                 username,
                 credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Inserts a project's comments into the database. These are public comments on the project itself, not code comments.
    
    Args:
        project_id (int): the ID of the project whose comments we're scraping.
        username (str): the username of the user who created the project.
        credentials_file (str): path to the database credentials file.

    Returns:
        None.
    """

    # DB connection
    connect_db(credentials_file=credentials_file)

    # Scrape comments
    scraper = Scraper()
    comments = scraper.get_project_comments(project_id)

    for comment in comments:
        preexisting = Comment.objects(project_id=project_id,
                                      comment_id=comment["id"]).first()

        if not preexisting:
            timestamp = datetime.strptime(comment["timestamp"],
                                          "%Y-%m-%dT%H:%M:%SZ")
            doc = Comment(comment_id=comment["id"],
                          project_id=project_id,
                          date=timestamp,
                          author=comment["username"].lower(),
                          recipient=username.lower(),
                          content=comment["comment"])
            doc.save()

    logging.debug("successfully scraped comments for project {}".format(project_id))  # yapf: disable
Example #7
def generate_certs(usernames,
                   credentials_file=settings.DEFAULT_CREDENTIALS_FILE,
                   cache_directory=settings.CACHE_DIRECTORY):
    """Initiates the generation of all Getting Unstuck certificates.

    Args:
        usernames (array-like): list of usernames to generate certificates for.
        credentials_file (str): path to the database credentials file.
        cache_directory (str): if set, will save the generated certificates into the cache directory specified.

    Returns: 
        None.
    """

    Scraper().make_dir(f"{cache_directory}/certificates")

    logging.info("attempting to generate certificates")
    connect_db(credentials_file=credentials_file)

    # Get schema IDs, and add to a reusable query that will get all the projects that have one of the schemas
    schema_ids = scrape.Studio.objects(
        public_show=True).values_list("challenge_id")
    query = []
    for schema_id in schema_ids:
        query.append({f"validation.{schema_id}": {"$exists": True}})
    projects = scrape.Project.objects(__raw__={"$or": query})

    # Loop through each username to generate certificate
    for username in usernames:
        # Get number of projects completed
        author_count = projects.filter(author=username).count()
        if author_count > 10:
            logging.info(
                "certificate for {} has more than 10 projects! reset to 10".
                format(username))
            author_count = 10

        # Generate certificate
        cert_download = convert_cert("pdf.html", username, author_count,
                                     cache_directory)

        if not cert_download:
            logging.info("certificate download failed for {}".format(username))

    logging.info("certificate generation completed!")
Example #8
def main():
    scrape = Scraper()
    arguments = get_arguments()
    projects, projects_to_studio = get_project_ids(scrape, arguments)

    if arguments.output_directory is None:
        scrape.download_projects(projects,
                                 projects_to_studio,
                                 file_name=arguments.output_name)
    else:
        scrape.download_projects(projects,
                                 projects_to_studio,
                                 output_directory=arguments.output_directory,
                                 file_name=arguments.output_name)
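The snippet ends at main(); if the script is meant to be run directly, the standard entry-point guard would follow (an assumption, since it is not shown above):

if __name__ == "__main__":
    main()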
Example #9
def scraper():
    return Scraper()
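This reads like a pytest fixture body with its decorator stripped; a minimal sketch of how a test might consume it, assuming the @pytest.fixture decorator and reusing the studio ID from Example #2:

import pytest

@pytest.fixture
def scraper():
    return Scraper()

def test_get_id(scraper):
    # get_id is used in Example #1; its exact return value isn't shown there.
    assert scraper.get_id("https://scratch.mit.edu/studios/26211962/") is not None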
Example #10
def add_studio(studio_id,
               schema=None,
               show=False,
               cache_directory=None,
               credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Scrapes a studio and inserts it into the database.
    
    Args:
        studio_id (int): the ID of the studio to scrape.
        schema (str): the object ID of the schema associated with this studio.
        show (bool): whether to show the studio on the public Challenges page.
        cache_directory (str): if set, will save the studio's project
            JSONs into the cache directory specified.
        credentials_file (str): path to the database credentials file.
    
    Returns:
        None.

    Raises:
        IOError: if couldn't write the JSON file to the given cache_directory.
    """

    # Load scraper class
    scraper = Scraper()

    # Add individual studio to DB
    studio_info = scraper.get_studio_meta(studio_id)
    if studio_info is not None:
        logging.info("attempting to scrape studio {}".format(studio_id))
        connect_db(credentials_file=credentials_file)

        preexisting = Studio.objects(studio_id=studio_id).first()
        if preexisting:
            # Update a few fields
            doc = preexisting
            doc.title = studio_info["title"]
            doc.description = studio_info["description"]
            doc.status = "in_progress"

            if show is not None:
                doc.public_show = show
        else:
            # New studio altogether
            doc = Studio(studio_id=studio_id,
                         title=studio_info["title"],
                         description=studio_info["description"],
                         status="in_progress",
                         public_show=show)

        if schema is not None:
            doc.challenge_id = schema

        doc.save()

        # Add all the projects
        project_ids = scraper.get_projects_in_studio(studio_id)

        # Delete projects no longer in studio
        delete = Project.objects(studio_id=studio_id,
                                 project_id__nin=project_ids)
        logging.info("deleting {} projects no longer in studio {}"
                     .format(delete.count(),
                             studio_id))
        delete.delete()

        # Add to studio
        for i, project in enumerate(project_ids):
            add_project(project,
                        studio_id=studio_id,
                        cache_directory=cache_directory,
                        credentials_file=credentials_file)
            if i % 10 == 0:
                logging.info("completed {}/{} projects in studio {}"
                             .format(i,
                                     len(project_ids),
                                     studio_id))

        stats = get_studio_stats(studio_id, credentials_file=credentials_file)

        preexisting = Studio.objects(studio_id=studio_id).first()
        if preexisting is not None:
            preexisting.status = "complete"
            preexisting.stats = stats
            preexisting.save()

        logging.info("successfully scraped studio {}".format(studio_id))
Example #11
def add_project(project_id,
                studio_id=0,
                cache_directory=None,
                credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Inserts a project into the database after scraping it. Updates existing database entries.
    
    Args:
        project_id (int): the ID of the project to scrape.
        studio_id (int): the studio ID with which this project should be associated.
        cache_directory (str): if set, will save this project
            JSON into the cache directory specified.
        credentials_file (str): path to the database credentials file.
    
    Returns:
        True if a new record was inserted or an existing one updated.
        False if the project is Scratch 2 or its statistics couldn't be parsed.

    Raises:
        IOError: if couldn't write the JSON file to the given cache_directory.
    """

    # Gather information about the project
    scraper = Scraper()
    metadata = scraper.get_project_meta(project_id)

    # Handle error from trying to decode ZIPs
    try:
        scratch_data = scraper.download_project(project_id)
    except RuntimeError:
        scratch_data = dict()

    # Convert to SB3 if possible
    parser = Parser()

    if not parser.is_scratch3(scratch_data) and settings.CONVERT_URL != "":
        try:
            r = requests.post(settings.CONVERT_URL, json=scratch_data)
            scratch_data = json.loads(r.json())
        except (requests.RequestException, ValueError, TypeError):
            # Conversion is best-effort; fall back to the unconverted data
            pass

    # Save to cache if needed
    if cache_directory is not None:
        if scraper.make_dir(f"{cache_directory}/projects"):
            name = "{0}/projects/{1}.json".format(cache_directory, project_id)  # yapf: disable
            with open(name, "w") as f:
                try:
                    json.dump(scratch_data, f)
                except (TypeError, ValueError, OSError) as e:
                    raise IOError(
                        "Couldn't write the JSON file to directory {0}".format(cache_directory)) from e  # yapf: disable

    # Parse the project using the parser class
    try:
        if parser.is_scratch3(scratch_data):
            stats = parser.blockify(scratch_data=scratch_data)
            if stats["blocks"] == False or stats["categories"] == False:
                stats = False
        else:
            stats = False
    except Exception:
        stats = False

    if not stats:
        logging.warning("Couldn't get statistics for project {}".format(project_id))   # yapf: disable
        return False

    # Change block_text's form
    text_new = {"text": [], "blocks": []}
    for text in stats["block_text"]:
        text_new["text"].append(text)
        text_new["blocks"].append(stats["block_text"][text])
    stats["block_text"] = text_new

    # Check database for existing project with project_id
    connect_db(credentials_file=credentials_file)
    preexisting = Project.objects(project_id=project_id).first()

    if preexisting:
        # Update a few fields
        doc = preexisting
        doc.title = metadata["title"]
        doc.description = metadata["description"]
        doc.instructions = metadata["instructions"]
        doc.author = metadata["author"]["username"].lower()
        doc.image = metadata["image"]
        doc.history = metadata["history"]
        doc.remix = metadata["remix"]
        doc.stats = stats
        doc.engagement = metadata["stats"]

        if studio_id > 0:
            doc.studio_id = studio_id

        if cache_directory is not None:
            doc.cache_expires = datetime.now() + timedelta(days=30)
    else:
        # Create a new record
        doc = Project(project_id=project_id,
                      title=metadata["title"],
                      description=metadata["description"],
                      instructions=metadata["instructions"],
                      author=metadata["author"]["username"].lower(),
                      image=metadata["image"],
                      history=metadata["history"],
                      remix=metadata["remix"],
                      engagement=metadata["stats"],
                      studio_id=studio_id,
                      stats=stats)

    doc.save()
    add_comments(project_id,
                 metadata["author"]["username"].lower(),
                 credentials_file=credentials_file)

    # Validate against studio's schema, if available
    if studio_id > 0:
        challenge = Studio.objects(
            studio_id=studio_id).only("challenge_id").first()
        if challenge is not None and challenge["challenge_id"] is not None:
            validation = schema.validate_project(
                challenge["challenge_id"],
                project_id,
                studio_id,
                credentials_file=credentials_file)
            del validation["_id"]
            doc.validation[str(challenge["challenge_id"])] = validation
            doc.save()

    logging.debug("successfully scraped project {}".format(project_id))

    return True
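The block_text reshaping above turns a text-to-blocks mapping into two parallel lists; a worked example with illustrative values:

block_text = {"hello": ["say"], "10": ["wait"]}  # illustrative values
text_new = {"text": [], "blocks": []}
for text in block_text:
    text_new["text"].append(text)
    text_new["blocks"].append(block_text[text])
# text_new == {"text": ["hello", "10"], "blocks": [["say"], ["wait"]]}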
Example #12
def get_project_page(pid, cache_directory=settings.CACHE_DIRECTORY):
    """Get a project page rendered in HTML given a project ID.
    
    Args:
        pid (int): project ID.
        cache_directory (str): the directory where cached projects are stored.
        
    Returns:
        A string containing the HTML for the page.
    """

    # Load in the project db, project JSON, studio info, and schema
    project, scratch_data = scrape.get_project(pid, cache_directory)

    if len(project) == 0 or len(scratch_data) == 0:
        message = ('We couldn’t find your project! '
                   '<a href="/project/r/{}">Try again</a>').format(pid)
        return render_template("project_loader.html", message=message)

    studio = scrape.get_studio(project["studio_id"])

    if "challenge_id" in studio:
        sc = schema.get_schema(studio["challenge_id"])

        # Determine whether there's an error here
        err = False
        if str(studio["challenge_id"]) in project["validation"]:
            project["validation"] = project["validation"][str(studio["challenge_id"])]  # yapf: disable
        else:
            err = True

        # Show error page
        if project == {} or scratch_data == {} or studio == {} or sc == {} or err:
            raise NotFound()

        # Prepare helper tools
        scraper = Scraper()
        visualizer = Visualizer()

        # Convert Markdown to HTML with Scratchblocks
        if "text" in sc:
            for key in sc["text"]:
                sc["text"][key] = common.md(sc["text"][key])

        # Get the code excerpt for the projects to be shown
        excerpts = dict()
        examples = get_comparisons(project, sc, 5) + [project]
        for example in examples:
            code, sprite = get_code_excerpt(example, sc)
            excerpts[example["project_id"]] = {
                "author": example["author"],
                "code": code,
                "sprite": sprite
            }

        # Get the saved reflection, if any
        _reflections = scrape.ProjectReflection.objects(
            project_id=pid).order_by("-timestamp")
        try:
            reflection = _reflections.first().to_mongo().to_dict()
            reflection["editable"] = (
                reflection["gu_uid"] == request.cookies.get("_gu_uid"))
        except (AttributeError, KeyError):
            # No saved reflection (first() returned None) or gu_uid missing
            reflection = dict()
    else:
        sc = dict()
        excerpts = dict()
        reflection = dict()

    # One prompt variable to take the logic out of the templating language
    prompt = {
        "title": (sc["title"] if "title" in sc and sc["title"] is not None
                  else studio["title"] if "title" in studio else None),
        "description": (sc["description"] if "description" in sc
                        else studio["description"] if "description" in studio else None)
    }

    # Choose stats to show
    studio["stats"] = get_studio_stats(sc, studio)

    # Get the feels
    feels = get_feels(randomize=True)

    return render_template("project.html",
                           prompt=prompt,
                           project=project,
                           studio=studio,
                           schema=sc,
                           excerpts=excerpts,
                           feels=feels,
                           reflection=reflection)