def extract_timestamps_for_db(db_handler: DBHandler, terms: Sequence[str]):
    """Build a per-term series of timestamp counts for one database table.

    For every term the handler's comments are fetched and handed, together
    with a fresh ``Counter``, to ``count_occurrences``; the counter's
    contents are then turned into a ``pd.Series`` (counts as values,
    counter keys as index).

    Args:
        db_handler: Handler used to fetch comments; its ``table_name``
            selects the processing path and labels the progress bar.
        terms: Terms to look up.

    Returns:
        A dict mapping each term to its ``pd.Series``.
    """
    series_by_term = {}
    for term in tqdm(terms, desc=db_handler.table_name):
        # NOTE(review): presumably count_occurrences fills this counter in
        # place -- its return value is never used here; confirm.
        counts = Counter()

        if db_handler.table_name == 'darkode':
            # For this table the fetched pair is used as-is:
            # (comments, timestamps).
            texts, stamps = db_handler.fetch_comments(term)
            count_occurrences(stamps, texts, term, counts)
        else:
            # For other tables the second fetched element is ignored and
            # each comment is run through parse_comments, which supplies
            # the timestamps and texts.
            raw_comments, _ = db_handler.fetch_comments(term)
            for raw in raw_comments:
                stamps, texts = parse_comments(raw, db_handler.table_name)
                count_occurrences(stamps, texts, term, counts)

        series_by_term[term] = pd.Series(
            data=list(counts.values()),
            index=list(counts.keys()),
        )

    return series_by_term
def _fetch_word_windows(dark_terms: List[str], hdlr: DBHandler, window_size: int) -> Union[List[WordWindow], list]:
    """Collect a ``WordWindow`` for each sentence where ``word_window`` finds one.

    For every dark term the handler's comments are fetched, each fetched
    entry is split into sentences with ``sent_tokenize``, and each cleaned
    sentence is passed to ``word_window`` with ``size=window_size // 2``.
    Falsy results are skipped; the others are unpacked into a left and a
    right part and wrapped in a ``WordWindow``.

    Args:
        dark_terms: Terms to search for.
        hdlr: Handler used to fetch the comments for each term.
        window_size: Total window size; half of it (integer division) is
            passed to ``word_window`` as ``size``.

    Returns:
        The list of collected ``WordWindow`` objects (empty if none matched).
    """
    collected = []
    for term in tqdm(dark_terms, desc='dark_terms'):
        fetched, _ = hdlr.fetch_comments(term)
        for entry in fetched:
            for sentence in sent_tokenize(entry):
                cleaned = clean_str(sentence)
                half_window = window_size // 2
                window = word_window(cleaned, term, size=half_window)
                if not window:
                    # No usable window for this sentence.
                    continue
                left, right = window
                collected.append(WordWindow(term, left, right))
    return collected