Example #1
def fetch_issues(state, since):
    """Fetch webcompat issues from Github."""

    GITHUB_OWNER = settings.GITHUB_OWNER
    GITHUB_REPO = settings.GITHUB_REPO

    g = Github(settings.GITHUB_API_TOKEN)
    org = g.get_organization(GITHUB_OWNER)
    repo = org.get_repo(GITHUB_REPO)
    kwargs = {"state": state}

    # Get last updated timestamp
    last_updated_timestamp = get_last_updated_timestamp()
    if since or last_updated_timestamp:
        kwargs["since"] = dateutilparse(since or last_updated_timestamp)

    issues = repo.get_issues(**kwargs)

    es = Elasticsearch([settings.ES_URL], **settings.ES_KWARGS)
    es.indices.create(index=settings.ES_WEBCOMPAT_INDEX, ignore=400)

    for i in issues:

        try:
            click.echo("Fetching issue: {}".format(i.id))

            # Prepare ES document object
            body = i.raw_data
            headers = {
                "Authorization": "token {}".format(settings.GITHUB_API_TOKEN)
            }
            response = requests.get(body["events_url"], headers=headers)
            response.raise_for_status()
            events_raw = response.json()

            # Query issue title and body to extract domains
            domains = set()
            domains.update(re.findall(FQDN_REGEX, i.title))
            domains.update(re.findall(FQDN_REGEX, i.body))

            body.update({"events": events_raw})
            body.update({"domains": list(domains)})
            body.update({"valid_domains": get_valid_domains(list(domains))})
            body.update({"parsed_url": get_parsed_url(i.body)})
            body.update(get_extracted_fields(i.body))
            es.index(
                index=settings.ES_WEBCOMPAT_INDEX,
                doc_type="webcompat_issue",
                id=i.number,
                body=body,
            )
        except Exception as e:
            click.echo(str(e), err=True)
            continue
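The call to dateutilparse above turns the stored timestamp back into the datetime that PyGithub's get_issues() expects for its "since" filter. A minimal sketch of just that step, assuming dateutilparse is an alias for dateutil.parser.parse (the import is not shown in the excerpt) and using a hypothetical build_issue_kwargs helper:

from dateutil.parser import parse as dateutilparse  # assumed alias

def build_issue_kwargs(state, since=None, last_updated_timestamp=None):
    """Hypothetical helper mirroring the kwargs construction above."""
    kwargs = {"state": state}
    # get_issues() takes a datetime for "since", so the stored string
    # timestamp is parsed back into one here.
    if since or last_updated_timestamp:
        kwargs["since"] = dateutilparse(since or last_updated_timestamp)
    return kwargs

print(build_issue_kwargs("open", since="2023-05-01T12:00:00Z"))
# {'state': 'open', 'since': datetime.datetime(2023, 5, 1, 12, 0, tzinfo=tzutc())}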
Example #2
def checkDomainAge(addr):
    try:
        registration = whois.whois(addr)
        created = registration.creation_date
        if created is not None:
            if isinstance(created, list):
                created = created[0]
            if isinstance(created, str):
                if created.startswith("before "):
                    created = created[7:]
                created = dateutilparse(created)
            if isinstance(created, datetime.date):
                age = (datetime.datetime.now() - created).days
                if age > 180:
                    return "older"
    except whois.parser.PywhoisError:
        pass
    return "< 6 month"
Example #3
def cast_val(value, directive):
    if directive == "Integer":
        if value.lower() == "false":
            return 0
        elif value.lower() == "true":
            return 1
        else:
            try:
                return int(value)
            except ValueError:
                return None
    elif directive == "Float":
        try:
            return float(re.sub("[^0-9\.\-]", "", value))
        except ValueError:
            return float(value)
    elif directive == "Time":
        if len(value) == 10 and sum(c.isdigit() for c in value) == 10:
            return int(
                time.mktime(
                    datetime.datetime.fromtimestamp(int(value)).timetuple()))
        elif len(value) == 13 and sum(c.isdigit() for c in value) == 13:
            return int(
                time.mktime(
                    datetime.datetime.fromtimestamp(int(value) /
                                                    1000).timetuple()))
        else:
            return int(
                time.mktime(dateutilparse(value, fuzzy=True).timetuple()))
    elif directive == "Text" or directive == "Phrase":
        return [
            PorterStemmer().stem(word) for word in clean_str(value).split(" ")
        ]
    elif directive == "Categorical":
        return value
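The "Time" branch accepts three shapes of input: 10-digit Unix epoch seconds, 13-digit epoch milliseconds, and anything else is parsed fuzzily with dateutilparse. A minimal sketch isolating that branch (parse_time is a hypothetical stand-in, and dateutilparse is assumed to be dateutil.parser.parse):

import datetime
import time
from dateutil.parser import parse as dateutilparse

def parse_time(value):
    """Hypothetical stand-in for the Time branch of cast_val."""
    if len(value) == 10 and value.isdigit():
        # 10-digit string: already epoch seconds
        return int(time.mktime(
            datetime.datetime.fromtimestamp(int(value)).timetuple()))
    if len(value) == 13 and value.isdigit():
        # 13-digit string: epoch milliseconds, scaled down to seconds
        return int(time.mktime(
            datetime.datetime.fromtimestamp(int(value) / 1000).timetuple()))
    # anything else: fuzzy-parse free text into a datetime
    return int(time.mktime(dateutilparse(value, fuzzy=True).timetuple()))

print(parse_time("1589212800"))                # epoch seconds, passed through
print(parse_time("1589212800000"))             # epoch milliseconds
print(parse_time("received on May 11, 2020"))  # fuzzy free-text parse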
Example #4
        metadata = {}
        metadata["timestamp"] = 0
        with open(os.path.join(e[0], f), 'r') as fd:
            metadata['relpath'] = os.path.relpath(os.path.join(e[0], f), cur_dir)
            for line in reversed(fd.readlines()):
                if line.count(end_sigil) > 0:
                    break
                if line.count(':') == 0:
                    continue
                key, content = line.split(':', 1)
                if key == 'Tags':
                    metadata[key] = [t.strip() for t in content.split(',')]
                elif key in known_md:
                    metadata[key] = content.strip()
            if "Date" in metadata.keys():
                ts = dateutilparse(metadata["Date"]).timestamp()
                print("Parsed Date: {}".format(ts))
                metadata["timestamp"] = ts
        blogs.append(metadata)
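The Date field is converted to an epoch timestamp with dateutilparse(...).timestamp(); in isolation, with a hypothetical front-matter value, that step looks like this (assuming dateutilparse is dateutil.parser.parse):

from dateutil.parser import parse as dateutilparse

ts = dateutilparse("2021-03-14 09:26:53 +0100").timestamp()
print("Parsed Date: {}".format(ts))  # Parsed Date: 1615710413.0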


def tags_to_s(tags):
    if not tags:
        tags = ['untagged']
    return "".join(["'", "', '".join(tags), "'"])


def bloglink(b):
    return "[{}]({})".format(b["Title"], b["relpath"])

Example #5
def _parse_entries(hardict):
    """
    Parse all entries, grouping those that have a page reference into
    page blocks. This will also handle requests occurring outside the
    scope of a page, retaining the original call timing.

    Args:
        hardict (dict): HAR dictionary
    Returns:
        luastr (str): Lua code snippet with page blocks.

    """
    # make sure pages and events are sorted in start-order since this
    # is not guaranteed by the HAR standard. This also handles events
    # that occur outside of the context of a page.
    pages = hardict["log"].get("pages", [])
    data = sorted(pages + hardict["log"]["entries"],
                  key=lambda dat: dateutilparse(dat["startedDateTime"]))

    # we want to save the time-deltas until the *next* entry in the list
    if len(data) > 1:
        dtimes = [dateutilparse(dat["startedDateTime"]) for dat in data]
        dtimes = [dtimes[i + 1] - dtimes[i] for i in range(len(dtimes) - 1)]
    else:
        dtimes = [0]

    # we make a page->event mapping for quick lookup
    entries = defaultdict(list)
    for entry in sorted(hardict["log"]["entries"],
                        key=lambda ev: dateutilparse(ev["startedDateTime"])):
        entries[entry.get("pageref", None)].append(entry)

    lua = []
    for idat, dat in enumerate(data):
        comment = dat.get("comment", "")
        if "id" in dat:
            # a page.
            title = dat["title"]
            pageref = dat["id"]
            comment = "%s (HAR pageref '%s')%s" % (
                title, pageref, " (Comment: %s)" % comment if comment else "")
            body = []
            entry_time = 0
            for entrydict in entries[pageref]:
                body.append(_get_entry(entrydict, batch=True))
                entry_time += entrydict["time"] if entrydict["time"] > 0 else 0
            if body:
                lua.append(
                    _LUA_PAGE.safe_substitute(comment=comment,
                                              pageref=pageref,
                                              body=",\n\n".join(body)))
            dtime = dtimes[idat]
            if dtime.microseconds > 0:
                # we should sleep before triggering the next page in
                # order to best emulate the user case we recorded. But
                # since the batch-requests block we must remove the
                # time that has already passed from page load.
                onload = dat["pageTimings"].get("onLoad", -1)
                onload = onload if onload and onload >= 0 else 0
                comment = dat["pageTimings"].get("comment", "")
                # Note - setting 10 ms as minimum sleep and assuming
                # entry time and onload time are independent of each
                # other.
                sleeptime = max(10, dtime.microseconds - entry_time - onload)
                lua.append([
                    "-- pause until next page%s." %
                    ((" (Comment: %s)" % comment) if comment else ""),
                    "client.sleep(%s, 1000)" % sleeptime
                ])

        elif "pageref" not in dat:
            # an entry outside the scope of a page
            comment = "Request outside page%s" % (" (Comment: %s" %
                                                  comment if comment else "")
            lua.append(
                _LUA_SINGLE.safe_substitute(comment=comment,
                                            body=_get_entry(dat)))

    return "\n".join(lua)
        "-o",
        "--older",
        help=
        "Abort those PV's whose workflow started more that this many days ago. To abort all PV's, specify 0.",
        default=7,
        type=int)
    args = parser.parse_args()
    if not args.url.endswith('bpl'):
        print(
            "The URL needs to point to the mgmt bpl; for example, http://arch.slac.stanford.edu/mgmt/bpl. ",
            args.url)
        sys.exit(1)
    neverConnectedPVs = requests.get(args.url + '/getNeverConnectedPVs').json()
    for neverConnectedPV in neverConnectedPVs:
        abort = False
        if args.older == 0:
            abort = True
        elif "startOfWorkflow" in neverConnectedPV:
            startOfWorkflow = dateutilparse(
                neverConnectedPV["startOfWorkflow"])
            if (datetime.datetime.now(tzlocal()) -
                    startOfWorkflow).total_seconds() >= (args.older * 86400):
                abort = True

        if abort:
            print("Aborting PV %s " % neverConnectedPV['pvName'])
            aresp = requests.get(args.url + '/abortArchivingPV',
                                 params={"pv": neverConnectedPV['pvName']})
            aresp.raise_for_status()
            time.sleep(0.25)
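The age check near the end compares an ISO timestamp parsed with dateutilparse against the current local time. A minimal sketch of that comparison with a hypothetical startOfWorkflow value (assumes dateutilparse is dateutil.parser.parse and tzlocal comes from dateutil.tz, as the example implies):

import datetime
from dateutil.parser import parse as dateutilparse
from dateutil.tz import tzlocal

start_of_workflow = dateutilparse("2024-01-01T08:00:00.000-08:00")  # made-up value
older_days = 7
age_seconds = (datetime.datetime.now(tzlocal()) - start_of_workflow).total_seconds()
print(age_seconds >= older_days * 86400)  # True once the workflow is over a week old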