def preprocess_csv(csvfilename, tweet_min_length, user_min_tweets,
                   remove_duplicates):
    """Preprocess the tweets of a CSV file and group them by user.

    Every row's ``text`` is passed through a ``TWLDAPreprocessor``. A tweet
    is kept only if its preprocessed result has at least
    ``tweet_min_length`` items; kept results are stored as a single
    space-joined string.

    Args:
        csvfilename: File handed to ``csv_reader``; each row must provide
            the ``user`` and ``text`` fields.
        tweet_min_length: Minimum number of items the preprocessed result
            must contain for the tweet to be kept.
        user_min_tweets: Minimum number of kept tweets a user needs in order
            to appear in the first returned mapping.
        remove_duplicates: When true, a tweet whose preprocessed text was
            already kept for the same user is skipped.

    Returns:
        A ``(grouped_tweets, grouped_tweets_source)`` tuple. The first maps
        each qualifying user to the list of preprocessed texts. The second
        maps users to the stripped original texts of the kept tweets and is
        NOT filtered by ``user_min_tweets``.
    """
    cl.progress('Preprocessing file: %s' % csvfilename)
    preprocessor = TWLDAPreprocessor()
    grouped_tweets = collections.defaultdict(list)
    grouped_tweets_source = collections.defaultdict(list)
    # Per-user set of texts already kept, so the duplicate check is a hash
    # lookup instead of a linear scan over the user's list.
    seen_texts = collections.defaultdict(set)

    for row in csv_reader(csvfilename):
        user = row['user']
        tokens = preprocessor.preprocess(row['text'])

        if len(tokens) < tweet_min_length:
            continue

        text = ' '.join(tokens)

        if remove_duplicates:
            if text in seen_texts[user]:
                continue
            seen_texts[user].add(text)

        grouped_tweets[user].append(text)
        grouped_tweets_source[user].append(row['text'].strip())

    grouped_tweets = {
        u: t
        for u, t in grouped_tweets.items() if len(t) >= user_min_tweets
    }
    return grouped_tweets, grouped_tweets_source
Ejemplo n.º 2
0
def recover_from_csv(csvfilename):
    """Yield each row of a CSV file, followed by records for its retweets.

    Rows come from ``csv_reader`` and are yielded unchanged. When a row's
    ``retweets`` field is a non-zero integer, up to 100 retweets are
    requested through ``twapi.GetRetweets`` and one dict is yielded per
    retweet, reusing the text of the original row. A failure while fetching
    retweets is reported as a warning and does not stop the iteration.

    A progress message is emitted after every 1000 rows read.
    """
    for count, row in enumerate(csv_reader(csvfilename), start=1):
        if count % 1000 == 0:
            cl.progress('%d record(s) have been recovered' % count)

        yield row

        # Nothing more to do for rows without retweets.
        if not int(row['retweets']):
            continue

        try:
            retweets = twapi.GetRetweets(int(row['id']), count=100)
        except Exception:
            # Best effort: report the failure and move on to the next row.
            cl.warning('Error: %s' % get_exc_line())
            continue

        for tweet in retweets:
            yield {
                'id': tweet.id_str,
                'text': row['text'],
                'timestamp': tweet.created_at,
                'likes': tweet.favorite_count,
                'retweets': tweet.retweet_count,
                'replies': None,
                'url': None,
                'html': None,
                'user': merge_whitespaces(tweet.user.screen_name),
                'fullname': merge_whitespaces(tweet.user.name)
            }
Ejemplo n.º 3
0
def demo2():
    """Run the scripted "Demo 2": a mock login followed by a failed update.

    The user is prompted for a username and a password (each prompt repeats
    until a non-empty value is given), then asked whether to update. The
    download progress and the final error are simulated; nothing is actually
    downloaded.
    """
    cl.section('Demo 2')

    # Whitespace-only input is treated as empty, like the Y/N answer below.
    username = ''
    while not username:
        username = cl.input('Username: ').strip()

    password = ''
    while not password:
        password = cl.password('Password: ')

    cl.success('Successfully logged in.')

    with cl.progress('Checking for update...', mode=cl.PROGRESS_SPIN):
        time.sleep(3)

    # Repeat the question until the answer is Y or N (case-insensitive).
    choice = ''
    while choice.lower() not in {'y', 'n'}:
        choice = cl.question(
            'A new version is present, would you like to update? (Y/N)').strip(
            )

    if choice.lower() == 'y':
        with cl.progress('Downloading ', mode=cl.PROGRESS_DETERMINATE) as p:
            time.sleep(1)
            p.update(0.2, ' 20% (1MB/5MB) ETA 4s')
            time.sleep(2)
            p.update(0.4, ' 40% (2MB/5MB) ETA 3s')

        # The demo deliberately ends the download with a failure message.
        cl.error('Failed to download package. SSL handshake error.')
    else:
        cl.warning('Update delayed!')
Ejemplo n.º 4
0
def export_csv(data, outfilename, encoding='utf-8'):
    """Write an iterable of dict records to a CSV file.

    The keys of the first record become the CSV header. If ``data`` yields
    nothing, a warning is printed and no file is created. A Ctrl-C while
    records are being consumed stops the export early; the rows written so
    far are kept and the file is closed normally.

    Args:
        data: Iterable of mappings, one per CSV row.
        outfilename: Path of the CSV file to create or overwrite.
        encoding: Text encoding of the output file.
    """
    cl.progress('Exporting data to csv file: %s' % outfilename)

    records = iter(data)
    try:
        head = next(records)
    except StopIteration:
        cl.warning('Empty data. Export aborted.')
        return

    # The first record has already been pulled from the iterator.
    written = 1

    with open(outfilename, 'w', newline='', encoding=encoding) as out:
        writer = csv.DictWriter(out, fieldnames=head.keys())
        writer.writeheader()
        writer.writerow(head)

        try:
            # Counting starts at 2 because ``head`` is record number 1.
            for written, record in enumerate(records, start=2):
                writer.writerow(record)
        except KeyboardInterrupt:
            cl.warning('User hit Ctrl-C, flushing data...')

    cl.success('%d record(s) saved to csv file.' % written)
Ejemplo n.º 5
0
def demo1():
    """Run the scripted "Demo 1": a mock test run with three passes and one failure.

    Each case shows a spinner for three seconds and then prints its verdict.
    All results are hard-coded; no real tests are executed.
    """
    cl.section('Demo 1')

    cl.info('Test program started.')

    # (spinner text, function that prints the verdict, verdict text)
    scripted_cases = (
        ('Running test case 1...', cl.success, 'Test case 1: Passed'),
        ('Running test case 2...', cl.success, 'Test case 2: Passed'),
        ('Running test case 3...', cl.success, 'Test case 3: Passed'),
        ('Running test case 4...', cl.error, 'Test case 4: Failed'),
    )
    for spinner_text, report, verdict in scripted_cases:
        with cl.progress(spinner_text, cl.PROGRESS_SPIN, erase=True):
            time.sleep(3)
        report(verdict)

    # Details of the scripted failure.
    for detail in ('Input: 1111', 'Expected output: 2222', 'Got: 3333'):
        cl.info(detail)

    cl.section('Test Result')
    cl.info('3 out of 4 test cases passed.')
    cl.info('Pass rate: 75%')
Ejemplo n.º 6
0
def run_linter(linter: Linter) -> bool:
    """Run a single linter command and report the outcome.

    Args:
        linter: Mapping with a ``name`` (used in messages) and a ``command``
            (passed to ``subprocess.call``).

    Returns:
        True when the command exits with status 0, False otherwise.
    """
    name = linter['name']
    cl.progress('Running linter {}'.format(name))
    exit_status = subprocess.call(linter['command'])  # nosec
    passed = exit_status == 0
    if passed:
        cl.success('Linter {} success'.format(name))
    else:
        cl.error('Linter {} failed'.format(name))
    return passed
Ejemplo n.º 7
0
def overview():
    """Print one sample message for each label function used in this demo."""
    cl.section('Overview of Labels')

    # (label function, sample message), emitted in this order.
    samples = (
        (cl.success, 'Good job! All test cases passed!'),
        (cl.warning, 'Warning! Security update delayed!'),
        (cl.error, 'Error! Failed to write file!'),
        (cl.info, 'Server listening on port 8888.'),
        (cl.progress, 'Downloading package, please wait...'),
        (cl.plain, 'Nothing interesting.'),
        (cl.question,
         'A new version is present, would you like to update? (Y/N)'),
    )
    for emit, message in samples:
        emit(message)
def preprocess_csv(csvfilename):
    """Merge all texts of each user into one record.

    Reads every row from ``csv_reader(csvfilename)``, collects the ``text``
    values per ``user``, and then yields one dict per user with the user as
    ``id`` and the texts joined by two spaces as ``text``.

    This is a generator: nothing (including the progress message) happens
    until iteration starts, and the whole file is read before the first
    record is yielded.
    """
    cl.progress('Preprocessing file: %s' % csvfilename)

    texts_by_user = collections.defaultdict(list)
    for record in csv_reader(csvfilename):
        texts_by_user[record['user']].append(record['text'])

    for user, texts in texts_by_user.items():
        yield {'id': user, 'text': '  '.join(texts)}
Ejemplo n.º 9
0
def win():
    """Congratulate the player, run a callback of their choice, then exit via ``bye()``.

    The callback name is obtained from ``get_callback()`` and is invoked by
    building the source text ``<callback>()`` and passing it to ``exec``.
    """
    cl.success('Congratulations! You solved all the challenges!')
    cl.info(
        "Now here is a gift for you. You can choose a callback function to call "
        "(e.g. try 'rainbow_fart')! Hope you can find the final flag through this!"
    )
    callback = get_callback()
    cl.progress(f'Executing callback {callback!r} for you...')
    # NOTE(review): exec on a value returned by get_callback() runs arbitrary
    # code if that value is not validated. The lint suppressions suggest this
    # is intentional -- confirm that get_callback() restricts its result.
    exec(f'{callback}()')  # pylint: disable=exec-used # nosec
    bye()
Ejemplo n.º 10
0
def show_level_stats(level_status, items_per_row=5):
    """Print the status of every level in rows, then the overall progress.

    Args:
        level_status: Sequence indexed by zero-based level number; its sum is
            reported as the number of completed levels.
        items_per_row: Maximum number of level blocks printed on one line.
    """
    cl.newline()
    cl.section('Level stats:')

    # All level numbers are rendered with the width of the largest one.
    number_width = len(str(NUM_LEVELS))
    row_count = math.ceil(NUM_LEVELS / items_per_row)

    for row_index in range(row_count):
        first = row_index * items_per_row
        # The last row may be shorter than items_per_row.
        stop = min(first + items_per_row, NUM_LEVELS)
        blocks = [
            show_level_block(level + 1, number_width, level_status[level])
            for level in range(first, stop)
        ]
        cl.info(' '.join(blocks))

    cl.progress(f'Your progress: {sum(level_status)}/{NUM_LEVELS}')
    check_milestones(level_status)
    cl.newline()
def preprocess_csv(csvfilename,
                   *,
                   preprocessor_cls=TextPreprocessor,
                   custom_stop_words=None,
                   lem_ignore_patterns=None):
    """Yield ``(id, preprocessed_text)`` pairs for the rows of a CSV file.

    Args:
        csvfilename: File handed to ``csv_reader``; each row must provide
            the ``id`` and ``text`` fields.
        preprocessor_cls: Class instantiated once to preprocess every text.
        custom_stop_words: Forwarded to ``preprocessor_cls``.
        lem_ignore_patterns: Forwarded to ``preprocessor_cls``.

    Yields:
        ``(row['id'], result)`` for every row whose preprocessing result is
        truthy; rows with an empty result are skipped.
    """
    cl.progress('Preprocessing file: %s' % csvfilename)

    text_preprocessor = preprocessor_cls(
        custom_stop_words=custom_stop_words,
        lem_ignore_patterns=lem_ignore_patterns,
    )

    for record in csv_reader(csvfilename):
        processed = text_preprocessor.preprocess(record['text'])
        if not processed:
            continue
        yield record['id'], processed
Ejemplo n.º 12
0
def animations():
    """Demonstrate each progress mode in turn: static, spin, expand, move, determinate.

    All waits are plain sleeps; the determinate download figures are
    scripted.
    """
    cl.section('Progress Animations')

    cl.item('Static')
    cl.progress('Downloading...')
    time.sleep(3)

    cl.item('Spin')
    with cl.progress('Downloading...', mode=cl.PROGRESS_SPIN):
        time.sleep(3)

    cl.item('Expand')
    with cl.progress('Downloading', mode=cl.PROGRESS_EXPAND):
        time.sleep(6)

    cl.item('Move')
    with cl.progress('Downloading ', mode=cl.PROGRESS_MOVE):
        time.sleep(4)

    cl.item('Determinate')
    # (fraction completed, status text), shown one second apart.
    scripted_steps = (
        (0.2, ' 20% (1MB/5MB) ETA 4s'),
        (0.4, ' 40% (2MB/5MB) ETA 3s'),
        (0.6, ' 60% (3MB/5MB) ETA 2s'),
        (0.8, ' 80% (4MB/5MB) ETA 1s'),
        (1, ' 100% (5MB/5MB)'),
    )
    with cl.progress('Downloading ', mode=cl.PROGRESS_DETERMINATE) as p:
        for fraction, status in scripted_steps:
            time.sleep(1)
            p.update(fraction, status)
    def wrapper(*args, **kwargs):
        """Run ``func`` behind a spinner and report whether it passed.

        Any exception raised by ``func`` is reported as a failure and then
        re-raised unchanged. The return value of ``func`` is discarded.
        """
        spinner = cl.progress(f"Running test case '{func.__name__}'...",
                              cl.PROGRESS_SPIN,
                              erase=True)

        try:
            func(*args, **kwargs)
        except BaseException:
            # Stop the spinner before printing so the message is not mixed
            # with the animation, then let the original exception propagate.
            spinner.stop()
            cl.error(f"Test case '{func.__name__}' failed.")
            raise

        spinner.stop()
        cl.success(f"Test case '{func.__name__}' passed.")
Ejemplo n.º 14
0
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    """Train an LDA topic model on preprocessed texts and save its artifacts.

    Steps, in order: load the preprocessed texts, build and prune a
    dictionary, convert the texts into a bag-of-words corpus (saved as
    JSON), train an ``LdaMulticore`` model (saved to a model file), measure
    topic coherence, and write a pyLDAvis HTML report.

    Args:
        input_filename: Name resolved through ``data_source_file`` and read
            with ``file_read_json``. The second element of every item is
            used as one document (presumably a token list -- confirm against
            whatever produces this file).
        keyword: Label shown in the log and embedded in generated file
            names; must consist only of ``-``, ``_``, ``+``, digits and
            ASCII letters.
        size: Integer shown in the log and embedded in generated file names.
        num_topics: Number of topics, passed to ``LdaMulticore``.
        iterations: Passed to ``LdaMulticore``.
        passes: Passed to ``LdaMulticore``.
        chunksize: Passed to ``LdaMulticore``.
        eval_every: Passed to ``LdaMulticore``.
        verbose: When true, configure ``logging`` at DEBUG level to write
            into a log file named after this run.
        gamma_threshold: Passed to ``LdaMulticore``.
        filter_no_below: ``no_below`` for ``Dictionary.filter_extremes``.
        filter_no_above: ``no_above`` for ``Dictionary.filter_extremes``.
        filter_keep_n: ``keep_n`` for ``Dictionary.filter_extremes``.
        open_browser: When true, open the HTML report with
            ``open_html_in_browser`` at the end.

    Raises:
        AssertionError: If ``keyword`` contains other characters (only when
            assertions are enabled).
    """
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    # The keyword becomes part of several file names below, so it is limited
    # to a safe character set.
    # NOTE(review): assert is skipped under ``python -O``; consider an
    # explicit check if this validation must always run.
    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    # Shared suffix that identifies this run in every generated file name.
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        # Keep only the second element of each item as the document.
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        # Unlike the other messages, this reports the bare name, not the
        # path returned by model_file().
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)