Esempi in Python per convert_date (ExtensionCrawler.dbbackend.mysql_backend.convert_date)

Esempio n. 1

0

Mostra file

File: db.py Progetto: logicalhacking/ExtensionCrawler

def get_etag(ext_id, datepath, con):
    """Return the crx ETag for the snapshot stored in ``datepath``, or None.

    Three sources are tried in order; the first one that yields a value wins:

    1. the full content of a ``*.crx.etag`` file,
    2. the ``"ETag"`` entry of a ``*.crx.headers`` file, whose content is
       parsed as a Python literal,
    3. ``con.get_etag`` for the date named in a ``*.crx.link`` file.

    Args:
        ext_id: Extension identifier, forwarded to ``con.get_etag``.
        datepath: Directory that is searched for the files listed above.
        con: Object providing ``get_etag(ext_id, date)``; only used when a
            link file is present.
    """

    def first_match(pattern):
        # First glob hit inside datepath, or None when nothing matches.
        hits = glob.glob(os.path.join(datepath, pattern))
        return hits[0] if hits else None

    # 1. Dedicated etag file.
    etag_file = first_match("*.crx.etag")
    if etag_file:
        with open(etag_file) as handle:
            return handle.read()

    # 2. Stored response headers.
    headers_file = first_match("*.crx.headers")
    if headers_file:
        with open(headers_file) as handle:
            raw_headers = handle.read()
        try:
            parsed_headers = ast.literal_eval(raw_headers)
            if "ETag" in parsed_headers:
                return parsed_headers["ETag"]
        except Exception:
            # Best effort: an unparsable header file only produces a warning.
            log_warning("* WARNING: could not parse crx header file", 3)

    # 3. Link file pointing at another snapshot.
    link_file = first_match("*.crx.link")
    if not link_file:
        return None
    with open(link_file) as handle:
        link_target = handle.read()
    # The first three characters are dropped (presumably a "../" prefix --
    # TODO confirm); the date is whatever precedes the next "/".
    linked_date = link_target[3:].partition("/")[0]
    return con.get_etag(ext_id, convert_date(linked_date))

Esempio n. 2

0

Mostra file

File: db.py Progetto: logicalhacking/ExtensionCrawler

def parse_and_insert_replies(ext_id, date, repliespath, con):
    """Read the reply JSON at ``repliespath`` and insert its comments.

    The file must decode to a mapping with a ``"searchResults"`` list; if
    that key is missing a warning is logged and nothing is inserted. For
    every annotation that has a comment, ``con.insert`` is called twice:
    once for ``"reply"`` with the metadata and once for ``"reply_comment"``
    with the comment text, both keyed by the MD5 digest of the comment.

    Args:
        ext_id: Extension identifier stored as ``extid``.
        date: Crawl date; passed through ``convert_date`` for each row.
        repliespath: Path of the JSON file to read.
        con: Object providing ``insert(table, **columns)``.
    """
    log_debug("- parsing reply file", 3)
    with open(repliespath) as reply_file:
        payload = json.load(reply_file)

    if "searchResults" not in payload:
        log_warning(
            "* WARNING: there are no search results in {}".format(repliespath),
            3)
        return

    for search_result in payload["searchResults"]:
        if "annotations" not in search_result:
            continue
        for annotation in search_result["annotations"]:
            comment = get(annotation, "comment")
            if comment is None:
                continue

            digest = hashlib.md5(comment.encode()).digest()
            entity = get(annotation, "entity")

            # Key order mirrors the column order handed to con.insert.
            reply_row = {
                "extid": ext_id,
                "date": convert_date(date),
                "commentdate": (
                    datetime.datetime.utcfromtimestamp(
                        get(annotation, "timestamp")).isoformat()
                    if "timestamp" in annotation else None),
                "replyto": get(get(entity, "annotation"), "author"),
                "commentmd5": digest,
                "displayname": get(entity, "displayName"),
                "author": get(entity, "author"),
                "language": get(annotation, "language"),
                "shortauthor": get(entity, "shortAuthor"),
            }
            con.insert("reply", **reply_row)
            con.insert("reply_comment", commentmd5=digest, comment=comment)

Esempio n. 3

0

Mostra file

File: db.py Progetto: logicalhacking/ExtensionCrawler

def parse_and_insert_support(ext_id, date, supportpath, con):
    """Read the support file at ``supportpath`` and insert its comments.

    Everything before the first ``{"`` in the file is discarded and the
    first JSON value after it is decoded; trailing data is ignored. For
    every entry of its ``"annotations"`` that has a comment, ``con.insert``
    is called for ``"support"`` (metadata) and ``"support_comment"``
    (comment text), both keyed by the MD5 digest of the comment.

    Args:
        ext_id: Extension identifier stored as ``extid``.
        date: Crawl date; passed through ``convert_date`` for each row.
        supportpath: Path of the file to read.
        con: Object providing ``insert(table, **columns)``.
    """
    log_debug("- parsing support file", 3)
    with open(supportpath) as support_file:
        raw_text = support_file.read()

    # raw_decode returns (value, end_index); only the value is needed.
    json_start = raw_text.find('{"')
    decoded, _end = json.JSONDecoder().raw_decode(raw_text[json_start:])

    if not get(decoded, "annotations"):
        return

    for review in decoded["annotations"]:
        comment = get(review, "comment")
        if comment is None:
            continue

        digest = hashlib.md5(comment.encode()).digest()
        entity = get(review, "entity")

        # Key order mirrors the column order handed to con.insert.
        support_row = {
            "extid": ext_id,
            "date": convert_date(date),
            "commentdate": (
                datetime.datetime.utcfromtimestamp(
                    get(review, "timestamp")).isoformat()
                if "timestamp" in review else None),
            "title": get(review, "title"),
            "commentmd5": digest,
            "displayname": get(entity, "displayName"),
            "author": get(entity, "author"),
            "language": get(review, "language"),
            "shortauthor": get(entity, "shortAuthor"),
        }
        con.insert("support", **support_row)
        con.insert("support_comment", comment=comment, commentmd5=digest)

Esempio n. 4

0

Mostra file

File: db.py Progetto: logicalhacking/ExtensionCrawler

def parse_and_insert_status(ext_id, date, datepath, con):
    """Insert one ``"status"`` row for the snapshot stored in ``datepath``.

    The row combines the results of ``get_overview_status`` and
    ``get_crx_status`` with the content of ``overview.html.exception``
    (``None`` when that file does not exist).

    Args:
        ext_id: Extension identifier stored as ``extid``.
        date: Crawl date; passed through ``convert_date``.
        datepath: Directory holding the snapshot files.
        con: Object providing ``insert(table, **columns)``.
    """

    def read_if_present(path):
        # Content of the file at path, or None when the path does not exist.
        if not os.path.exists(path):
            return None
        with open(path) as handle:
            return handle.read()

    log_debug("- parsing status file", 3)
    status_of_overview = get_overview_status(datepath)
    status_of_crx = get_crx_status(datepath)
    exception_text = read_if_present(
        os.path.join(datepath, "overview.html.exception"))

    status_row = {
        "extid": ext_id,
        "date": convert_date(date),
        "crx_status": status_of_crx,
        "overview_status": status_of_overview,
        "overview_exception": exception_text,
    }
    con.insert("status", **status_row)

Esempio n. 5

0

Mostra file

File: db.py Progetto: logicalhacking/ExtensionCrawler

def parse_and_insert_overview(ext_id, date, datepath, con):
    """Parse ``overview.html`` in ``datepath`` and insert the extracted data.

    If the file does not exist nothing is inserted. Otherwise one
    ``"extension"`` row is inserted via ``con.insert``, followed by one
    ``"category"`` row per comma-separated category found in the page.
    Any field that cannot be found in the page is stored as ``None``.

    Args:
        ext_id: Extension identifier stored as ``extid``.
        date: Crawl date; passed through ``convert_date`` for each row.
        datepath: Directory holding ``overview.html`` and the crx files
            consulted by ``get_etag``.
        con: Object providing ``insert(table, **columns)``; it is also
            handed on to ``get_etag``.

    Raises:
        ValueError: If the rating, rating count or download count found in
            the page is not a valid number.
    """
    log_debug("- parsing overview file", 3)
    overview_path = os.path.join(datepath, "overview.html")
    if not os.path.exists(overview_path):
        return

    with open(overview_path) as overview_file:
        contents = overview_file.read()

    def first_group(pattern):
        # Text of the first capture group of the first match, else None.
        match = re.search(pattern, contents)
        return match.group(1) if match else None

    # All patterns are raw strings: "\s" and "\d" in a normal string literal
    # are invalid escape sequences and trigger a warning at compile time.
    name = first_group(r'<meta itemprop="name" content="(.*?)"\s*/>')
    version = first_group(r'<meta itemprop="version" content="(.*?)"\s*/>')

    rating_text = first_group(
        r'<meta itemprop="ratingValue" content="(.*?)"\s*/>')
    rating = float(rating_text) if rating_text is not None else None

    rating_count_text = first_group(
        r'<meta itemprop="ratingCount" content="(.*?)"\s*/>')
    rating_count = (
        int(rating_count_text) if rating_count_text is not None else None)

    # Categories are stored in the page as one comma-separated attribute.
    categories_text = first_group(
        r'Attribute name="category">(.+?)</Attribute>')
    categories = (
        categories_text.split(",") if categories_text is not None else None)

    # Download count with thousands separators. The inner group is
    # non-capturing "(?:...)"; the previous "(:?...)" also accepted colons,
    # which made int() fail on the captured text.
    downloads_text = first_group(
        r'<meta itemprop="interactionCount" content="UserDownloads:((?:\d|,)+)')
    downloads = (
        int(downloads_text.replace(",", ""))
        if downloads_text is not None else None)

    itemcategory = first_group(
        r'<Attribute name="item_category">(.*?)</Attribute>')

    # The remaining fields are located through the parsed document tree.
    doc = BeautifulSoup(contents, 'html.parser')

    # Short description: first child of the description element; the full
    # description is the markup of that element's parent.
    description_parent = doc.find('div', itemprop="description")
    description = None
    full_description = None
    if description_parent:
        if description_parent.contents:
            description = str(description_parent.contents[0])
        full_description = str(description_parent.parent)

    # NOTE(review): the class names below look like generated CSS classes of
    # the store page -- confirm they are still current.
    offeredby_parent = doc.find(class_=lambda cls: cls and "e-f-Me" in cls)
    offeredby = (
        "".join(str(child) for child in offeredby_parent.contents)
        if offeredby_parent else None)

    developer_parent = doc.find(
        class_=lambda cls: cls and "C-b-p-rc-D-J" in cls)
    developer = (
        "".join(str(child) for child in developer_parent.contents)
        if developer_parent else None)

    # Guard against an element without children, as done for the
    # description, instead of raising IndexError.
    last_updated_parent = doc.find(
        class_=lambda cls: cls and "h-C-b-p-D-xh-hh" in cls)
    last_updated = (
        str(last_updated_parent.contents[0])
        if last_updated_parent and last_updated_parent.contents else None)

    etag = get_etag(ext_id, datepath, con)

    con.insert(
        "extension",
        extid=ext_id,
        date=convert_date(date),
        name=name,
        version=version,
        description=description,
        downloads=downloads,
        rating=rating,
        ratingcount=rating_count,
        fulldescription=full_description,
        offeredby=offeredby,
        developer=developer,
        itemcategory=itemcategory,
        crx_etag=etag,
        lastupdated=last_updated)

    if categories:
        for category in categories:
            con.insert(
                "category",
                extid=ext_id,
                date=convert_date(date),
                category_md5=hashlib.md5(category.encode()).digest(),
                category=category)