Example 1
def extract_info_from_html(html_content: str):
    soup = BeautifulSoup(html_content, "html.parser")
    speech_content_div_tag = soup.find("div", class_="field-docs-content")

    paragraphs = []
    # collect the text of each element child (e.g. <p> tags)
    for p_tag in speech_content_div_tag.contents:
        if isinstance(p_tag, Tag):
            _string = p_tag.string
            if _string:
                paragraphs.append(_string.strip())

    full_text = " ".join(paragraphs)
    sentences = break_text_into_sentences(full_text)

    president_name_div_tag = soup.find("div", class_="field-title")
    president_name = None

    for i in president_name_div_tag.contents:
        if isinstance(i, Tag):
            president_name = i.contents[0].text

    span_tag = soup.find("span", class_="presidential-ordinal-number")
    president_seq = span_tag.text

    speech_year_span_tag = soup.find("span", class_="date-display-single")
    speech_year = speech_year_span_tag.text.strip()
    speech_year = speech_year[-4:]

    logger.info(
        f"{president_seq} President of the United States: {president_name}")

    return speech_year, president_name, sentences
Example 2
def get_video_metadata(video_path: str) -> dict:

    # check that the file exists
    video_path = Path(video_path)
    if not video_path.is_file():
        logger.error(f"Invalid video_path: `{video_path}` does not exist.")
        raise Exception("Invalid video_path: file does not exist.")

    # check that the extension is a known video format
    known_video_formats = (".mp4", ".flv", ".mov", ".avi", ".wmv", ".mkv")
    video_path_abs = video_path.resolve()
    head, tail = os.path.split(video_path_abs)
    name, ext = os.path.splitext(tail)
    if ext.lower() not in known_video_formats:
        logger.warning(
            f"Invalid video_path: `{tail}` is not a known video format.")
        raise ValueError(
            f"Invalid video_path: `{tail}` is not a known video format.")

    command_template = "ffprobe -v error -select_streams v:0 -show_entries stream=width,height,avg_frame_rate,duration -of json"
    args = shlex.split(command_template)
    args.append(str(video_path))
    # run ffprobe and capture its JSON output
    proc = subprocess.run(args, stdout=subprocess.PIPE)
    json_string: str = proc.stdout.decode("utf-8").strip()

    # logger.debug(json_string)

    json_obj: dict = json.loads(json_string)

    streams: list = json_obj.get("streams", [])
    if not streams:
        raise Exception(f"No video stream found in `{video_path}`")
    if len(streams) > 1:
        logger.info(f"More than one stream found at {video_path}")
    _data: dict = streams[0]

    width: int = _data.get("width")
    height: int = _data.get("height")
    ratio = width / height
    avg_frame_rate: str = _data.get("avg_frame_rate")
    # Fraction parses ffprobe rates like "30000/1001" without eval
    # (requires `from fractions import Fraction`)
    frame_rate: int = round(Fraction(avg_frame_rate)) if avg_frame_rate else None
    duration: float = round(float(_data.get("duration")), 2)

    video_metadata: dict = {
        "filepath": str(video_path_obs),
        "filename": name,
        "ext": ext,
        "width": width,
        "height": height,
        "ratio": ratio,  # width / height
        "duration": duration,  # in number of seconds
        "fps": frame_rate,  # frame per seconds
        "avg_frame_rate": avg_frame_rate,
    }

    # logger.debug(json.dumps(video_metadata, indent=4))
    return video_metadata
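A minimal usage sketch (the filename is a hypothetical placeholder; assumes ffprobe is on PATH):

if __name__ == "__main__":
    metadata = get_video_metadata("sample.mp4")  # hypothetical local file
    print(json.dumps(metadata, indent=4))
Example 3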
def parse_single_event(event_lines: List[str]) -> Optional[Dict[str, str]]:
    # strip the BEGIN/END markers; a malformed block yields None
    if event_lines[0] == "BEGIN:VEVENT" and event_lines[-1] == "END:VEVENT":
        event_lines = event_lines[1:-1]
    else:
        logger.error("Invalid Event")
        return None

    event = OrderedDict({
        "SUMMARY": None,  # Title
        "DESCRIPTION": None,  # Description
        "LOCATION": None,  # Location
        "DTSTART": None,  # Date Start
        "DTEND": None,  # Date End
        "UID": None,  # Unique Identifier
        "RRULE": None,  # Recurring Rule
        "EXDATE": None,  # Exception Date(s)
    })

    for line in event_lines:
        # each line looks like "KEY:LINE_VALUE"
        colon_index = line.find(":")
        key = line[:colon_index]
        if key not in event:
            # skip unnecessary keys
            continue
        line_value = line[colon_index + 1:]
        # timezone fix
        if key == "DTSTART" or key == "DTEND" or key == "EXDATE":
            event[key + ";TZID=Asia/Singapore"] = line_value
        elif key == "RRULE":
            # TODO: convert UNTIL field to GMT (add Z at the back as well)
            # see: https://www.kanzaki.com/docs/ical/recur.html
            event[key] = line_value
        else:
            event[key] = line_value
    # fix up SUMMARY: move any "ROOM:..." suffix into LOCATION, then trim at the colon
    original_summary = event.get("SUMMARY")
    logger.info(original_summary)
    if original_summary:
        room_index = original_summary.find("ROOM:")
        if room_index != -1:
            location_value = original_summary[room_index + 5:]
            event["LOCATION"] = location_value.strip()
            original_summary = original_summary[:room_index]

        chop_index = original_summary.find(":")
        if chop_index != -1:
            chopped = original_summary[:chop_index - 2]
            event["SUMMARY"] = chopped.strip()

    if event.get("SUMMARY") == event.get("DESCRIPTION"):
        event.pop("DESCRIPTION")

    # collect and drop keys that were never filled in
    unused_keys = [key for key in event.keys() if not event.get(key)]
    for key in unused_keys:
        event.pop(key)

    return event
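A usage sketch with a made-up event block (all field values are illustrative):

sample_lines = [
    "BEGIN:VEVENT",
    "SUMMARY:50.012 Networks: L1 ROOM:1.415",
    "DTSTART:20200113T090000",
    "DTEND:20200113T110000",
    "UID:example-uid-001",
    "END:VEVENT",
]
event = parse_single_event(sample_lines)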
Example 4
def bump_version(filepath: Path) -> Path:
    # append/increase a version suffix until the path is free
    parent, stem, ext = filepath.parent, filepath.stem, filepath.suffix
    suffix = 1
    while filepath.is_file():
        logger.info(f"{filepath} already exists")
        suffix += 1
        filepath = parent / f"{stem}_v{suffix}{ext}"
    return filepath
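For example, if report.txt and report_v2.txt already exist, the call below returns report_v3.txt (the filename is a hypothetical placeholder):

out_path = bump_version(Path("report.txt"))
logger.info(f"Writing to {out_path}")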
Example 5
def main():
    """Start the bot"""

    logger.info("Getting bot_token from environment")
    bot_token = os.environ.get("BOT_TOKEN", None)

    if bot_token == "REPLACE_ME" or bot_token is None:

        logger.info("Getting bot_token from src/config.conf")
        config_fp = curr_folder / "config.conf"

        if not config_fp.is_file():
            logger.error("bot_token not found: No Config File is Found.")
            return

        with config_fp.open() as f:
            config = json.load(f)
        bot_token = config.get("bot_token", None)

    if bot_token == "REPLACE_ME" or bot_token is None:
        logger.error(
            "bot_token not found: Failed getting bot token from environment and 'config.conf'"
        )
        return

    load_stats()

    # Create the Updater and pass it your bot's token.
    # Make sure to set use_context=True to use the new context based callbacks
    # Post version 12 this will no longer be necessary
    updater = Updater(bot_token, use_context=True)
    # Get the dispatcher to register handlers
    dispatcher = updater.dispatcher

    # Command Handlers
    dispatcher.add_handler(CommandHandler("start", start))
    dispatcher.add_handler(CommandHandler("help", help))
    dispatcher.add_handler(CommandHandler("stats", stats))
    dispatcher.add_handler(CommandHandler("friends", friends))
    dispatcher.add_handler(CommandHandler("source", source))

    # Message Handlers
    dispatcher.add_handler(MessageHandler(Filters.text, echo))
    dispatcher.add_handler(MessageHandler(Filters.document, ics))

    # log all errors
    dispatcher.add_error_handler(error)

    # Start the Bot
    updater.start_polling()
    updater.idle()
Example 6
def main():
    with open("links.txt") as f:
        links = f.readlines()

    for link in links:
        page_url = link.strip()
        r = requests.get(page_url)
        if r.status_code == 200:
            logger.info(page_url)
            html_content = r.text
            speech_year, president_name, sentences = extract_info_from_html(
                html_content)
            logger.info(speech_year)
            logger.info(president_name)
            export_path = (Path("Inaugural_Addresses") /
                           f"{speech_year} {president_name}.txt")
            with export_path.open("w") as f:
                f.write("\n".join(sentences))
            logger.info(f"Exported: {export_path}")

        else:
            logger.error(page_url)
            logger.error(f"Unexpected response: status code {r.status_code}")
Example 7
def migrate_from_unlabelled_to_local():
    base_path = Path("/home/UROP/data_urop/unlabelled")
    destination_folder = Path("/home/UROP/data_urop/all_videos_local")
    assert base_path.is_dir()
    assert destination_folder.is_dir()
    migration_list = []

    for video in os.listdir(base_path):
        video_path = base_path / video
        if not video_path.is_file():
            logger.warning(f"Unexpected non-file entry: {video_path}")
            return

        if video_path.suffix != ".mp4":
            logger.info(f"Skip {video_path}")
            continue

        target_path = destination_folder / video
        if target_path.is_file():
            logger.info(f"{target_path} already exist")
        else:
            logger.debug(f"{video_path} -> {target_path}")
            migration_list.append([str(video_path), str(target_path)])

    logger.debug(f"Number of file to copy: {len(migration_list)}")
    proceed = input("Proceed? (y/n)")
    if proceed != "y":
        logger.warning("Abort")
        return
    logger.debug(json.dumps(migration_list, indent=4))
    proceed = input("Proceed? (y/n)")
    if proceed != "y":
        logger.warning("Abort")
        return
    with multiprocessing.Pool() as pool:
        result = pool.map(call_safe_copy, migration_list)
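call_safe_copy is defined elsewhere in the project; a plausible sketch of what it does, given that it receives a [src, dst] pair (an assumption, not the original implementation):

def call_safe_copy(src_dst_pair):
    # hypothetical helper: copy src -> dst, skipping existing targets
    src, dst = src_dst_pair
    if Path(dst).is_file():
        logger.info(f"{dst} already exists, skipped")
        return dst
    shutil.copy2(src, dst)  # requires `import shutil`
    return dst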
Example 8
def migrate_from_drive_to_local():
    base_path = Path(
        "/home/UROP/shared_drive/Video_Folders/Trimmed_All_Videos")
    destination_folder = Path("/home/UROP/data_urop/all_videos_local")
    assert base_path.is_dir()
    assert destination_folder.is_dir()
    migration_list = []

    for folder in os.listdir(base_path):
        folder_path = base_path / folder
        if not folder_path.is_dir():
            logger.info(f"Skip {folder_path}")
            continue

        logger.debug(folder)

        if folder.startswith("bilibili_"):
            # folder_num = int(folder[-3:])
            # if folder_num <= 80:
            #     continue

            folder_path = base_path / folder
            assert folder_path.is_dir()

            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                new_name = "b_" + file
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])

        elif folder.startswith("youtube_"):
            # folder_num = int(folder[-3:])
            # if folder_num <= 10:
            #     continue

            folder_path = base_path / folder
            assert folder_path.is_dir()

            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                new_name = "y_" + file
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])

        elif folder.endswith("yutian"):
            folder_path = base_path / folder
            assert folder_path.is_dir()

            for file in os.listdir(folder_path):
                if not file.endswith(".mp4"):
                    logger.info(f"Skip {file}")
                    continue
                new_name = ""
                for i in file.lower():
                    if i == "(":
                        new_name += "_"
                    elif i == ")":
                        pass
                    else:
                        new_name += i
                new_name = "c_" + new_name
                video_filepath: Path = folder_path / file
                dst_path: Path = destination_folder / new_name
                logger.debug(f"{video_filepath} -> {dst_path}")
                migration_list.append([str(video_filepath), str(dst_path)])

    logger.debug(f"Number of file to copy: {len(migration_list)}")
    proceed = input("Proceed? (y/n)")
    if proceed != "y":
        logger.warning("Abort")
        return
    logger.debug(json.dumps(migration_list, indent=4))
    proceed = input("Proceed? (y/n)")
    if proceed != "y":
        logger.warning("Abort")
        return
    with multiprocessing.Pool() as pool:
        result = pool.map(call_safe_copy, migration_list)
Example 9
try:
    if _USER.endswith("ME"):
        from .db_secrete import _DB_NAME, _DB_PASS, _DB_USER
    else:
        _DB_USER = _USER
        _DB_PASS = _PASS
        _DB_NAME = _NAME
except ImportError:
    logger.error("No Database Config Found.")

logger.info(
    f"""-----------------------------------
    MongoDB config:

    User: {_DB_USER}
    Database Name: {_DB_NAME}
    -----------------------------------"""
)


client = pymongo.MongoClient(
    f"mongodb+srv://{_DB_USER}:{_DB_PASS}@clusteresc.xvunj.mongodb.net/{_DB_NAME}?retryWrites=true&w=majority",
    ssl=True,
)

db = client[f"{_DB_NAME}"]
db_available = False

try:
Example 10
        logger.debug(f"Exported: {export_path}")


if __name__ == "__main__":
    src_folder = Path(__file__).parent / "Inaugural_Addresses"
    result_folder = Path(__file__).parent / "results"

    word_occurrence_result = {}

    for filename in os.listdir(src_folder):
        filepath = src_folder / filename
        assert filepath.is_file()
        logger.debug(filename)
        with filepath.open() as f:
            lines = f.readlines()
            logger.info(f"Number of sentences: {len(lines)}")

            words = get_word_list_from_lines(lines)
            logger.info(f"Number of words: {len(words)}")

            word_occurence_map = get_word_occurence_map(words)
            logger.info(
                f"Number of unique words: {len(word_occurence_map.keys())}")
            # keep only pronouns that appear more than once, sorted by frequency
            word_occurence_map = {
                k: v
                for k, v in sorted(word_occurence_map.items(),
                                   key=lambda x: x[1],
                                   reverse=True) if v > 1 and k in PRONOUNS
            }
            word_occurrence_result[filename[:-4]] = word_occurence_map
Example 11
origins = ["http://localhost", "http://*****:*****"]


@app.get("/api")
async def index():
    return {"Hello": "SUTD Housing Portal"}
Example 12
def validate_new_application(target_AP_uid: str, student_id: str,
                             stay_period: TimePeriod) -> bool:
    """
    This method does the following checks to ensure incoming Application Form is valid.

    1. Check if target ApplicationPeriod exists in DB.
    2. Check if current time fall in between the valid application window.
    3. Check if current student is among the eligible students in the particular ApplicationPeriod.
    4. Check if current student has already submitted an application for this particular ApplicationPeriod.
    5. Check if the stay period specified by the student is valid options in this particular ApplicationPeriod.
    """
    _now = datetime.now()
    try:
        ap_dict = application_periods_collection.find_one(
            {"uid": target_AP_uid})
        clean_dict(ap_dict)
    except Exception as e:
        logger.error(MSG.DB_QUERY_ERROR)
        logger.error(e)
        raise HTTPException(status_code=500, detail=MSG.DB_QUERY_ERROR)

    if not ap_dict:
        logger.info(f"ApplicationPeriod '{target_AP_uid}' Not Found")
        return False

    window_open_dt = ap_dict.get("application_window_open")
    window_close_dt = ap_dict.get("application_window_close")
    if not window_open_dt <= _now <= window_close_dt:
        logger.info(
            f"Student({student_id}) attempted submitting application to ApplicationPeriod({target_AP_uid})"
        )
        logger.info(f"Failed. Not in Application Window: {target_AP_uid}")
        return False

    application_forms_map: Dict[str,
                                str] = ap_dict.get("application_forms_map")
    if student_id not in application_forms_map:
        logger.info(
            f"Ineligible Student({student_id}) attempted submitting application to ApplicationPeriod({target_AP_uid})"
        )
        # NOTE: this restriction is temporarily relaxed
        logger.info(f"Restriction temporarily relaxed")
        # return False
    if application_forms_map.get(student_id) != "":
        logger.info(
            f"Illegal second submission by Student({student_id}) to ApplicationPeriod({target_AP_uid})"
        )
        # NOTE: this restriction is temporarily relaxed
        logger.info(f"Restriction temporarily relaxed")
        # return False

    u_start_date = stay_period.start_date
    u_end_date = stay_period.end_date
    applicable_periods: List[Dict[str, datetime]] = ap_dict.get(
        "applicable_periods")
    period_matched = False
    for _period in applicable_periods:
        start_date = convert_datetime_to_date(_period.get("start_date"))
        end_date = convert_datetime_to_date(_period.get("end_date"))
        if u_start_date == start_date and u_end_date == end_date:
            logger.debug("Stay Period Matched!")
            period_matched = True
            break
    if not period_matched:
        logger.info(
            f"Illegal Stay Period by Student({student_id}) to ApplicationPeriod({target_AP_uid})"
        )
        return False

    return True
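A hypothetical call site (the uid, student id, and dates are placeholders; assumes TimePeriod exposes start_date and end_date as used above, and `from datetime import date`):

ok = validate_new_application(
    target_AP_uid="AP-2021-001",   # hypothetical uid
    student_id="1004321",          # hypothetical student id
    stay_period=TimePeriod(start_date=date(2021, 5, 1),
                           end_date=date(2021, 8, 1)),
)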