def cli():
    """Score the sentiment of Markdown posts and report the results.

    Command-line arguments:
        blogpath: a single post, or a directory searched recursively for
            files whose name ends with ``-ext`` (default ``.md``).
        xlsfile: spreadsheet to write when ``blogpath`` is a directory.
        --only: analyze just the ``title`` or ``description`` entry of the
            header returned by ``hugoutils.get_header`` instead of the whole
            file text.

    Posts whose header ``expiryDate`` is earlier than the current time are
    skipped; their rows are left empty. For a single input file the table is
    printed instead of being written to ``xlsfile``.

    Raises:
        NotADirectoryError: ``blogpath`` is neither a file nor a directory.
    """
    p = argparse.ArgumentParser()
    p.add_argument("blogpath", help="directory of Markdown posts to analyze")
    p.add_argument("xlsfile", help="path of .xlsx file to write")
    p.add_argument("--only", help="part of page to analyze", choices=["title", "description"])
    p.add_argument("-ext", help="filename suffix", default=".md")
    P = p.parse_args()

    blog_path = Path(P.blogpath).expanduser()
    xlsx = Path(P.xlsfile).expanduser()

    single_file = blog_path.is_file()
    if single_file:
        files = [blog_path]
    elif blog_path.is_dir():
        files = list(blog_path.rglob(f"*{P.ext}"))
        # Only directory mode writes the spreadsheet, so only then create
        # its parent directory (and do it early, before the slow analysis).
        xlsx.parent.mkdir(parents=True, exist_ok=True)
    else:
        raise NotADirectoryError(blog_path)

    cols = ["pos", "neu", "neg", "compound"]
    if P.only:
        cols.append(P.only)

    dat = pandas.DataFrame(index=[f.stem for f in files], columns=cols)
    now = datetime.datetime.now()
    total = len(files)

    for i, file in enumerate(files):
        # Carriage return keeps the progress counter on a single line.
        print(f"{i + 1} / {total} {file.stem:<80}", end="\r")

        header = hugoutils.get_header(file)[0]
        if header is not None and "expiryDate" in header:
            # Only the leading YYYY-MM-DD part of the value is compared.
            expiry = datetime.datetime.strptime(header["expiryDate"][:10], "%Y-%m-%d")
            if expiry < now:
                print("skip", file)
                continue

        if P.only:
            try:
                text = header[P.only]
            except TypeError:
                # header is None: nothing to look up for this file.
                continue
            except KeyError:
                logging.error("%s does not have %s", file.stem, P.only)
                continue
        else:
            # Decode explicitly as UTF-8 so results do not depend on the
            # platform's locale encoding; undecodable bytes are dropped.
            text = file.read_text(encoding="utf-8", errors="ignore")

        s = analyze_post(text)
        row = [s["pos"], s["neu"], s["neg"], s["compound"]]
        if P.only:
            row.append(text)
        # Assign by position, not by label: files in different
        # sub-directories can share a stem, and a label-based write would
        # overwrite every row carrying that stem.
        dat.iloc[i] = row

    if single_file:
        print(dat)
    else:
        dat.to_excel(xlsx)
def get_tags(path: Path, taxonomy_type: str) -> set[str]:
    """Collect the distinct taxonomy terms used by the posts in a directory.

    Args:
        path: directory whose ``*.md`` files are inspected (not recursive).
        taxonomy_type: header key to read from each post, e.g. ``"tags"``.

    Returns:
        The set of all terms found under ``taxonomy_type``. Files with no
        header, without that key, or with an empty value contribute nothing.
    """
    dat: set[str] = set()

    for f in path.glob("*.md"):
        header = hugoutils.get_header(f)[0]
        try:
            tags = header[taxonomy_type]
        except (TypeError, KeyError):
            # No header at all (None) or the key is absent.
            continue

        if tags is None:
            # Key present but left empty; iterating None would raise.
            continue
        if isinstance(tags, str):
            # A lone scalar term: add it whole rather than letter by letter.
            dat.add(tags)
        else:
            dat.update(tags)

    return dat
# Count how many posts use each tag and write the tally to a spreadsheet.
# NOTE(review): assumes `p` is an argparse.ArgumentParser created just before
# this point -- its construction is not visible here; confirm.
p.add_argument("path", help="path to read Markdown blog files")
p.add_argument("xlsx", help="excel filename to write")
p.add_argument("-ext", help="filename suffix", default=".md")
p = p.parse_args()

inpath = Path(p.path).expanduser()
if not inpath.is_dir():
    raise NotADirectoryError(inpath)
xlsx = Path(p.xlsx).expanduser()

files = list(inpath.rglob(f"*{p.ext}"))

dat: dict[str, int] = {}
for f in files:
    header = hugoutils.get_header(f)[0]
    try:
        tags = header["tags"]
    except (TypeError, KeyError):
        # No header at all (None) or the post has no "tags" entry.
        continue
    except Exception as e:
        logging.error("%s: %s", e, f.stem)
        # Without this, the loop below would run with `tags` unbound or
        # still holding the previous file's tags.
        continue

    if tags is None:
        # "tags" present but left empty; iterating None would raise.
        continue
    if isinstance(tags, str):
        # A lone scalar tag: count it whole rather than letter by letter.
        tags = [tags]

    for tag in tags:
        dat[tag] = dat.get(tag, 0) + 1

# One row per tag, in first-seen order, with a single "count" column.
pandas.DataFrame(index=list(dat), data=list(dat.values()), columns=["count"]).to_excel(xlsx)