Example no. 1
    def __init__(self):
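        # Default config lives next to this module: <module dir>/.kairos/config.json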
        config_file = os.path.join(
            os.path.dirname(__file__), ".kairos", "config.json")
        if not config_file.endswith(".json"):
            raise ValueError(f"Not a json file: {config_file}")
        config_file = normalize_filepath(config_file)
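        # If the config file is missing, offer to create it interactively.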
        if not isfile(config_file):
            print(yellow(f"No such file: {config_file}"))
            reinitializing = input(
                green("Do you want me to create it? [y/N] "))
            if reinitializing.strip().lower().startswith('y'):
                print(green("Initializing a new Kairos configuration file..."))
                makedirs(dirname(config_file), exist_ok=True)
                with open(config_file, 'w') as f:
                    # Seed the new config with the default app_state.
                    f.write(json.dumps(app_state))

        self.path_to_config = config_file
        self.is_loaded, self.data, self.err = self.load_appstate()
        # Guard against a failed load: self.data may not contain 'timestamps'.
        self.available_timestamps = (
            list(self.data.get('timestamps', {})) if self.is_loaded else [])

        self.last_used_format = (
            self.available_timestamps[0] if self.available_timestamps else None)
Example no. 2
    def print_available_timestamps(self):
        msg = "Available timestamps are:\n\n"
        left_pad = "     + "
        now = datetime.datetime.now()
        print(green(msg))
        all_ok = True
        for k in self.available_timestamps:
            template = self.data['timestamps'][k]
            ok, rendered, err = self.__render__(template, now)
            if ok:
                timestamp = rendered['timestamp']
                print(
                    f"""{left_pad}{blue(k)}\n          Example: {cyan(timestamp)}\n\n""")
            else:
                # Render failed: show the error and the raw payload.
                print(f"{red(err)}")
                pp(rendered)
                all_ok = False
        return all_ok
Example no. 3
def extract_entities_with_allennlp(*s):
    model_url = "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    global allennlp_model
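    # Load the NER predictor once per process and cache it as a module global.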
    if "allennlp_model" not in globals() or not globals()["allennlp_model"]:
        print(yellow("[ model_init ]"), " :: Loading AllenNLP NER model...")

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1
        allennlp_model = Predictor.from_path(model_url,
                                             cuda_device=cuda_device)
        print(
            yellow("[ model_init ] "),
            " :: CUDA initialized? ",
            green("YES") if cuda_device == 0 else red("NO"),
        )
        print(yellow("[ model_init ] "), " :: Load complete.")
    print(yellow(f"[ model_predict ]"), f" :: Extracting entities...")
    start = datetime.datetime.now()
    ents = []
    for i, part in enumerate(s):
        if not part:
            continue
        elif len(part) <= 36:
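            # Pad very short inputs with filler dots, presumably so the tagger
            # sees a minimum-length sentence.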
            part = f"{part} . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
        curr = []
        try:
            # print(yellow(f"[ model_predict ]"), " :: Next input:")
            # for line in textwrap.wrap(part):
            #     print("                ", cyan(line))

            results = allennlp_model.predict(sentence=part)
            # print(yellow(f"[ model_predict ]"), green(" :: OK"))
        except Exception as e:
            print(yellow(f"[ model_predict ]"),
                  red(f" :: {e.__class__.__name__}! :: {e}"))
            continue
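        # Assemble location spans from the BIOUL tags: keep LOC tokens, glue
        # apostrophe suffixes onto the previous token, and close the span on an
        # L (last) or U (unit) tag.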
        for word, tag in zip(results["words"], results["tags"]):
            if not re.search(r"(LOC)", tag):
                continue
            elif word.startswith("'") and curr:
                curr[-1] += word
            else:
                curr.append(word)
            if tag[0] in "LU":
                span = " ".join(curr)
                if len(span) >= 3:
                    ents.append(span)
                curr = []
    finish = datetime.datetime.now()
    elapsed = (finish - start).total_seconds()
    mins, secs = elapsed // 60, elapsed % 60
    human_readable = (
        f"{magenta(str(int(mins)).zfill(2))}m {blue(str(int(secs)).zfill(2))}s"
    )
    # print(json.dumps(results))
    print(yellow(f"[ model_predict ]"), f" :: Extraction complete.")
    print(yellow(f"[ model_predict ]"), f" :: Elapsed time : ", human_readable)

    print(
        "======================================================================================================"
    )
    print(green(s))
    print(
        "======================================================================================================"
    )
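    # Group mentions case-insensitively, use the most frequent casing as the
    # canonical form, and count mentions per entity.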
    cased = defaultdict(list)
    for ent in ents:
        cased[ent.lower()].append(ent)
    for k, v in cased.items():
        cased[k] = sorted(v, key=v.count)
    freqs = {
        v[-1]: len(v)
        for v in sorted(cased.values(), key=len, reverse=True)
    }
    print(blue("Extracted entities:"))
    print(cyan(json.dumps(freqs, indent=4)))
    return freqs
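
# Usage sketch (hypothetical input, not from the original source): the call
# returns a dict mapping each extracted place name to its mention count.
#   extract_entities_with_allennlp("Flooding closed roads in Harris County, Texas.")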
Example no. 4
    #     continue
    # else:
    shape = None
    if prediction in county_shapes:
        shape = county_shapes[prediction]
    elif prediction in state_shapes:
        shape = state_shapes[prediction]
    elif prediction.startswith("District of Columbia"):
        # No dedicated D.C. shape; approximate it with the convex hull of all
        # state shapes.
        states = state_shapes.values()
        shape = cascaded_union(list(states)).convex_hull
    if shape is None:
        print(f"No shape for prediction: {prediction}")

    # else:
    #     print(f"No shape for prediction: {prediction}")
    #     continue
    print(green(prediction), blue(shape))

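    # Scale the raw approval score by a confidence-based coefficient (capped
    # at 1.0) and by the points available for this audience.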
    total_points = base_scores[audience]
    base = approval_score
    coeff = min(1.0, confidence / 100 + 0.2)
    adjusted = base * coeff * total_points

    print(f"{cyan(row['title'])}")
    print(f"Predicted locale: {magenta(prediction)}")
    print(f"Predicted audience: {magenta(row['audience'])}")
    print(f"Original score: {magenta(row['score'])}")
    print(f"Possible points: {red(total_points)}")
    print(f"Raw score: {yellow(approval_score)}")
    print(f"Geoconfidence: {green(confidence / 100)}")
    print(f"Perplexity penalty: {blue(coeff)}")
    print(f"Percent of total points awarded: {cyan(base * coeff)}")
Example no. 5
            except Exception as exc:
                # Keep the exception: `as err` is unbound once the block exits.
                err = exc
                data = {}
        ok = err is None
        return ok, data, err

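# Module-level convenience wrappers around the Kairos class.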
def create_timestamp(dt: datetime.datetime, timestamp_format='human_fixedlength') -> str:
    k = Kairos()
    ts = k.create_timestamp(dt, timestamp_format)
    return ts

def parse_timestamp(ts: str, timestamp_format=None) -> datetime.datetime:
    if timestamp_format:
        k = Kairos()
        if timestamp_format in k.available_timestamps:
            # A named Kairos format: resolve it to its strptime template.
            template = k.load_template(timestamp_format)
            parsed = datetime.datetime.strptime(ts, template)
        else:
            # Otherwise treat the argument as a raw strptime format string.
            parsed = datetime.datetime.strptime(ts, timestamp_format)
    else:
        # No explicit format: fall back to the generic parse() helper.
        parsed = parse(ts)
    return parsed

if __name__ == '__main__':
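    # Quick smoke test: list the available formats and render "now" with the
    # default one.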
    k = Kairos()
    print(k.available_timestamps)
    ok = k.print_available_timestamps()
    msg = green("OK") if ok else red("FAIL")
    now = datetime.datetime.now()
    ts = create_timestamp(now)
    print(f"Converted {yellow(now)} to {yellow(ts)}")
    print(msg)
Example no. 6
        [row['url'] for row in db.query("select url from articles a;")])
    print(f"Loaded {len(seen)} seen urls.")
    rows = {
        row["url"]: row
        for row in db.query(
            "select * from spiderqueue where lastmod is not null order by lastmod desc limit 5000;"
        ) if row["url"] not in seen
    }
    print(f"Found {len(rows)} uncrawled urls...")
    urls = list(rows)[:LIMIT]
    random.shuffle(urls)
    # urls = random.sample(urls, k=len(urls))

    print(
        green(
            f"[ process_queue ] :: Added {len(urls)} urls to the queue."
        ))
    responses = fetch_all_responses(urls, MAX_REQUESTS)

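    # Default each row to "rejected" before inspecting its fetched response.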
    for url, res in responses.items():
        row = rows[url]
        row["prediction"] = "rejected"
        row["mod_status"] = "rejected"
        is_dumpsterfire = row["is_dumpsterfire"]

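        # A string result (rather than a response object) is treated as a
        # failed fetch.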
        if isinstance(res, str):
            row["ok"] = False

            if is_dumpsterfire:
                dumpsterfire.upsert(row, ["url"])
                print(
Example no. 7
async def main(queue, limit=30):
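    # Roughly breadth-first Wikipedia crawl seeded from `queue`; rows are
    # upserted into the us_metros2 table in batches.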
    crawldb = db["us_metros2"]
    updates = []
    dups = defaultdict(list)
    seen = set()
    print(f"Initializing crawler....")

    async def fetch(name, url, pagetype, parent):
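        # Fetch one page, collect its links, categories and coordinates into a
        # row, and enqueue unseen outbound links.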
        seen.add(url)
        async with httpx.AsyncClient() as client:
            try:
                res = await client.get(url,
                                       headers=default_headers,
                                       timeout=10)
            except Exception as e:
                print(e.__class__.__name__, e, url)
                return
            print(f"Fetched {url}")
        if res.status_code != 200:
            return
        dom = fromstring(res.content)
        stub = {
            "page": name,
            "url": url,
            "type": pagetype,
            "parent_url": parent,
            "page_hrefs": [
                f"https://en.wikipedia.org{link}" for link in dom.xpath(
                    "//div[contains(@id,'mw-content-text')]//a/@href")
            ],
            # lxml element truthiness depends on child count, so check the
            # attribute directly rather than relying on `if link`.
            "page_links": [
                link.attrib["title"] for link in dom.xpath(
                    "//div[contains(@id,'mw-content-text')]//a")
                if "title" in link.attrib
            ],
            "response": res.status_code,
            "ok": res.status_code == 200,
            "length": len(res.content),
            "category_hrefs": [
                f"https://en.wikipedia.org{link}" for link in dom.xpath(
                    "//div[contains(@id,'catlinks')]//a/@href")
            ],
            "category_links": [
                link.attrib["title"]
                for link in dom.xpath("//div[contains(@id,'catlinks')]//a")
                if "title" in link.attrib
            ],
            "latitude": None,
            "longitude": None,
        }
        # Only the coordinate selectors are consumed here; take the first match.
        for colname, sel in xpath_selectors.items():
            if colname in ("latitude", "longitude"):
                result = [node.text_content() for node in dom.xpath(sel)]
                if result:
                    stub[colname] = result[0]

        updates.append(stub)

        # Record duplicates and enqueue unseen links. Fresh variable names
        # avoid clobbering the `url`/`name` parameters of this fetch.
        for href in stub["category_hrefs"] + stub["page_hrefs"]:
            # print(green(href))
            if href and href in seen:
                dups[href].append(name)
                # print(f"Page {href} is duplicated on: {len(dups[href])} pages")
            elif href and href not in seen and "Talk" not in href:
                link_name = href.split("/wiki/")[-1].replace("_", " ")
                queue.append((link_name, href, "link", parent))

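    # Drain the queue in batches of `limit` concurrent fetches, flushing to the
    # database whenever more than 500 rows have accumulated.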
    queue = deque(queue)

    while queue:
        async with trio.open_nursery() as nursery:
            for _ in range(limit):
                next_url = None
                # Pop until an unseen, non-User, non-Talk url turns up or the
                # queue runs dry.
                while next_url is None and queue:
                    page_name, _next, page_type, parent = queue.popleft()
                    if (_next not in seen and "User" not in _next
                            and "Talk" not in _next):
                        seen.add(_next)
                        next_url = _next
                if next_url is None:
                    break
                nursery.start_soon(fetch, page_name, next_url, page_type,
                                   parent)
        if len(updates) > 500:
            print("Updating database...")
            crawldb.upsert_many(updates, ["url"])
            for item in updates:
                print(green(json.dumps(item, indent=4)))
            print(f"Inserted {len(updates)} items.")
            updates = []