Example #1
0
def extract_timestamps_for_db(db_handler: DBHandler, terms: Sequence[str]):
    """Build one ``pd.Series`` of occurrence counts per search term.

    For every term, the matching comments are fetched through
    ``db_handler.fetch_comments`` and handed to ``count_occurrences``,
    which fills a fresh ``Counter``. How the fetched data is passed on
    depends on the table:

    * ``'darkode'``: the two values returned by ``fetch_comments`` are
      used directly as comments and timestamps.
    * any other table: only the first returned value is used; each of its
      entries is run through ``parse_comments`` to obtain the timestamps
      and texts.

    Args:
        db_handler: Handler providing ``table_name`` and ``fetch_comments``.
        terms: Terms to look up; progress is shown with ``tqdm``, labelled
            with the table name.

    Returns:
        A dict mapping each term to a ``pd.Series`` whose values are the
        counter's counts and whose index is the counter's keys
        (presumably timestamps -- confirm against ``count_occurrences``).
    """
    series_by_term = {}
    for term in tqdm(terms, desc=db_handler.table_name):
        # A new counter per term, so counts never leak between terms.
        occurrences = Counter()
        if db_handler.table_name != 'darkode':
            # The second element of the fetch result is ignored here; the
            # timestamps come from parsing each fetched entry instead.
            raw_entries, _ = db_handler.fetch_comments(term)
            for raw_entry in raw_entries:
                stamps, texts = parse_comments(raw_entry,
                                               db_handler.table_name)
                count_occurrences(stamps, texts, term, occurrences)
        else:
            texts, stamps = db_handler.fetch_comments(term)
            count_occurrences(stamps, texts, term, occurrences)
        series_by_term[term] = pd.Series(data=list(occurrences.values()),
                                         index=list(occurrences.keys()))
    return series_by_term
Example #2
0
def _fetch_word_windows(dark_terms: List[str], hdlr: DBHandler,
                        window_size: int) -> Union[List[WordWindow], list]:
    """Collect the word windows around each dark term in fetched comments.

    For every dark term, the first value returned by
    ``hdlr.fetch_comments`` is iterated; each entry is split into
    sentences with ``sent_tokenize``, every sentence is cleaned with
    ``clean_str`` and passed to ``word_window``. Sentences for which
    ``word_window`` returns a falsy result are skipped.

    Args:
        dark_terms: Terms to look up; progress is shown with ``tqdm``.
        hdlr: Handler providing ``fetch_comments``.
        window_size: Total window size; ``window_size // 2`` is passed to
            ``word_window`` as ``size``.

    Returns:
        A list of ``WordWindow(dark_term, left, right)`` objects in the
        order they were found; empty if nothing matched.
    """
    collected = []

    for term in tqdm(dark_terms, desc='dark_terms'):
        # Only the first element of the fetch result is needed here.
        fetched, _ = hdlr.fetch_comments(term)
        for entry in fetched:
            for sentence in sent_tokenize(entry):
                cleaned = clean_str(sentence)
                window = word_window(cleaned, term, size=window_size // 2)
                if not window:
                    continue
                left_side, right_side = window
                collected.append(WordWindow(term, left_side, right_side))

    return collected