def text_preprocessor(input_filename,
                      *,
                      preprocessor_cls='TextPreprocessor',
                      custom_stop_words=None,
                      lem_ignore_patterns=None,
                      remove_duplicates=False):
    cl.section('Text Preprocessor')

    input_filename = data_source_file(input_filename)
    preprocessor_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        result = preprocess_csv(input_filename,
                                preprocessor_cls=preprocessor_cls,
                                custom_stop_words=custom_stop_words,
                                lem_ignore_patterns=lem_ignore_patterns)

        if remove_duplicates:
            result = remove_duplicate_text(result)

        result = tuple(result)
        cl.info('Effective data size: %d' % len(result))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(result, input_filename)
# The enclosing decorator is reconstructed here; the original snippet showed
# only the inner wrapper, and the decorator name 'time_measure' is assumed.
def time_measure(func):
    def wrapper(*args, **kwargs):
        t_start = timeit.default_timer()
        ret = func(*args, **kwargs)
        t_end = timeit.default_timer()
        time_str = pretty_time(t_end - t_start)
        cl.info("Function '%s' cost time: %s" % (func.__name__, time_str))
        return ret
    return wrapper
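A minimal usage sketch for the timing decorator reconstructed above; the decorator name time_measure and the decorated function are illustrative, not part of the original listing:

@time_measure
def run_pipeline():
    # Any long-running procedure; its runtime is logged via cl.info()
    # when the call returns.
    time.sleep(2)

run_pipeline()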
Example #3
def twapi_search(query, count, *, sleep_time=0, **kwargs):
    with open(DEBUG_FILENAME, 'w'):
        pass

    last_id = None
    num_fetched = 0

    while count > 0:
        next_count = min(count, MAX_COUNT_PER_REQ)
        next_id = last_id - 1 if last_id is not None else None
        data = list(
            twapi_search_page(query,
                              next_count,
                              next_id,
                              sleep_time=sleep_time,
                              **kwargs))

        if not data:
            cl.warning('No more data can be retrieved, terminating...')
            return

        for item in data:
            last_id = int(item['id'])
            yield item

        num_fetched += len(data)
        count -= len(data)

        cl.info('Current number of records fetched: %d' % num_fetched)
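A hedged sketch of consuming the generator above; the query, count and language are illustrative. Each yielded item is a raw tweet dict, and its 'id' field drives the paging (next_id = last_id - 1, presumably passed on as max_id by twapi_search_page):

tweets = list(twapi_search('#python', 500, sleep_time=1, lang='en'))
cl.info('Total tweets collected: %d' % len(tweets))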
Example #4
def retweets_recover(csvfilename):
    cl.section('Retweets Recover')
    cl.info('Recovering file: %s' % csvfilename)

    csvfilename = data_source_file(csvfilename)
    result = recover_from_csv(csvfilename)
    exportfilename = name_with_title_suffix(csvfilename, '-recovered')
    export_csv(result, exportfilename)
    return os.path.basename(exportfilename)
Example #5
def overview():
    cl.section('Overview of Labels')
    cl.success('Good job! All test cases passed!')
    cl.warning('Warning! Security update delayed!')
    cl.error('Error! Failed to write file!')
    cl.info('Server listening on port 8888.')
    cl.progress('Downloading package, please wait...')
    cl.plain('Nothing interesting.')
    cl.question('A new version is present, would you like to update? (Y/N)')
def model_analyzer(modeldesc, sourcedesc, *, num_top_words=30,
                   num_top_docs=30, debug=False):
    cl.section('LDA Model Analyzer')
    cl.info('Model description: %s' % modeldesc)
    cl.info('Source description: %s' % sourcedesc)

    with TimeMeasure('load_all'):
        ldamodel, corpus, prep_items, source_texts = load_all(modeldesc,
                                                              sourcedesc)

    with TimeMeasure('analyzing'):
        prep_ids = tuple(item[0] for item in prep_items)
        dictionary = ldamodel.id2word
        num_topics = ldamodel.num_topics
        topics = [{
                    'topic_id': i,
                    'words': get_topic_words(ldamodel, i, num_top_words),
                    'popularity': 0.0,
                    'documents': collections.defaultdict(float)
                } for i in range(num_topics)]

        if debug:
            debugfilename = model_file('ldadoctopics-%s.txt' % modeldesc)
            with open(debugfilename, 'w', encoding='utf-8') as debugfile:
                for index, doc in enumerate(corpus):
                    text_id = prep_ids[index]
                    doc_topics = ldamodel.get_document_topics(doc)
                    text = source_texts[text_id].strip()
                    debugfile.write('%s -> %r, %s\n' % (text_id, doc_topics,
                                                        text))

        term_topics_cache = {}

        for word in dictionary:
            term_topics_cache[word] = ldamodel.get_term_topics(word)

        for index, doc in enumerate(corpus):
            for topic_id, prob in ldamodel.get_document_topics(doc):
                topics[topic_id]['popularity'] += prob

            for word, freq in doc:
                if word not in dictionary:
                    continue

                for topic_id, prob in term_topics_cache[word]:
                    topics[topic_id]['documents'][index] += prob * freq

        for topic in topics:
            topic['documents'] = get_topic_top_docs(topic['documents'],
                                                    num_top_docs,
                                                    prep_ids, source_texts)

        topics = sorted(topics, key=lambda x: x['popularity'], reverse=True)

    with TimeMeasure('export_markdown'):
        export_markdown(modeldesc, sourcedesc, topics)
Example #7
def win():
    cl.success('Congratulations! You solved all the challenges!')
    cl.info(
        "Now here is a gift for you. You can choose a callback function to call "
        "(e.g. try 'rainbow_fart')! Hope you can find the final flag through this!"
    )
    callback = get_callback()
    cl.progress(f'Executing callback {callback!r} for you...')
    exec(f'{callback}()')  # pylint: disable=exec-used # nosec
    bye()
def twlda_multiple_run(num_topics_range,
                       iteration,
                       desc_prefix,
                       show_console_output=True):
    cl.section('Twitter-LDA Multiple Run')

    for topics in num_topics_range:
        cl.info('Running with %d topics' % topics)
        twitter_lda(output_desc='%s-%d' % (desc_prefix, topics),
                    topics=topics,
                    iteration=iteration,
                    show_console_output=show_console_output)
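A hedged usage sketch; the topic range, iteration count and description prefix below are illustrative:

# Train Twitter-LDA once for each topic count from 5 to 15.
twlda_multiple_run(range(5, 16), 1000, 'java', show_console_output=False)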
Example #9
def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)

    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))

    random.shuffle(data)
    data = data[:amount]

    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)
Example #10
def show_level_stats(level_status, items_per_row=5):
    cl.newline()
    cl.section('Level stats:')
    field_width = len(str(NUM_LEVELS))
    rows = math.ceil(NUM_LEVELS / items_per_row)
    for row in range(rows):
        cl.info(' '.join(
            show_level_block(x + 1, field_width, level_status[x])
            for x in range(row * items_per_row,
                           min((row + 1) * items_per_row, NUM_LEVELS))))
    cl.progress(f'Your progress: {sum(level_status)}/{NUM_LEVELS}')
    check_milestones(level_status)
    cl.newline()
Example #11
def main():
    cl.section('Welcome to Python Challenges')
    cl.info(f'Python version: {PYTHON_VERSION}')
    level_status = [False] * NUM_LEVELS
    while True:
        show_level_stats(level_status)

        cl.info(f'Enter a level number (1-{NUM_LEVELS}) to solve a level, '
                'or enter 0 to view source code')
        level_number = get_level_number()

        if level_number == 0:
            print(SOURCE, end='')
            continue

        if level_status[level_number - 1]:
            cl.success('You already solved this level')
            continue

        level_func = globals()[f'level_{level_number}']
        answer = get_input(f'Your answer for level {level_number}: ')

        timer = threading.Timer(CHALLENGE_TIMEOUT, die, args=('Timeout!', ))
        timer.start()

        try:
            global_check(answer)
            answer = ast.literal_eval(answer.strip())
        except Exception:  # pylint: disable=broad-except
            timer.cancel()
            cl.error('Wrong answer')
            if DEBUG_MODE:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            level_func(answer)
        except Exception:  # pylint: disable=broad-except
            timer.cancel()
            cl.error('Wrong answer')
            if DEBUG_MODE:
                traceback.print_exc(file=sys.stdout)
            continue

        timer.cancel()
        cl.success('Correct answer')
        level_status[level_number - 1] = True
Example #12
def twitter_lda(*, output_desc, topics, iteration, alpha_g=None,
                beta_word=0.01, beta_b=0.01, gamma=20,
                show_console_output=True):
    cl.section('Twitter-LDA Runner')
    cl.info('Output description: %s' % output_desc)

    assert re.fullmatch(r'[-_0-9a-zA-Z]+', output_desc)

    if alpha_g is None:
        alpha_g = 50 / topics

    set_parameters(topics, alpha_g, beta_word, beta_b, gamma, iteration)

    with TimeMeasure('Twitter-LDA training'):
        run_twlda(show_console_output=show_console_output)

    move_result(output_desc)
Example #13
def measure_coherence(model, texts, corpus, dictionary):
    cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary,
                        coherence='u_mass')
    u_mass = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_v')
    c_v = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_uci')
    c_uci = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_npmi')
    c_npmi = cm.get_coherence()

    cl.info('Topic coherence: u_mass = %f, c_v = %f, c_uci = %f, c_npmi = %f'
            % (u_mass, c_v, c_uci, c_npmi))
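measure_coherence builds on gensim's CoherenceModel; a minimal sketch of the assumed import and a call, mirroring how the training code elsewhere in this listing invokes it:

from gensim.models import CoherenceModel

measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)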
def main():
    tag = time.strftime('java-%Y%m%d%H%M%S')
    tweets_file_recovered = 'twdata-java-recovered.csv'
    userinfo_file = 'twusers-java.csv'
    num_topics_range = list(range(MIN_TOPICS, MAX_TOPICS + 1))

    # Preprocess
    retry_until_success(text_preprocessor_twlda,
                        tweets_file_recovered[:-4],
                        tweet_min_length=2,
                        user_min_tweets=1,
                        remove_duplicates=True)

    # Train (with different number of topics)
    for topics in num_topics_range:
        cl.info('Running with %d topics' % topics)
        retry_until_success(twitter_lda,
                            output_desc='java-%d' % topics,
                            topics=topics,
                            iteration=ITERATIONS,
                            show_console_output=True)

    # Analyze (Perplexity Plot + HTML Reports + Compress)
    report_files = []
    plot_file, minima_points = plot_diff_topics(num_topics_range, 'java',
                                                r'Perplexity is ([\d.]+)',
                                                pipe_encoding)
    report_files.append(plot_file)
    report_points = minima_points if REPORT_ONLY_MINIMA else num_topics_range

    for topics in report_points:
        report_files.append(
            visualization_twlda(KEYWORD,
                                'java-%d' % topics,
                                '%s-%d' % (tag, topics),
                                userinfo_file,
                                open_browser=False))
    compress_report_files(tag, report_files)
        # Test filtering by category. Check whether result is in a correct format.
        Select(self.driver.find_element_by_id(
            'filterCategory')).select_by_index(1)
        self.driver.find_element_by_id('btnFilterCategory').click()
        self.wait.until(
            EC.visibility_of_element_located((By.ID, 'filterCategoryResult')))
        self.assertIn(
            '购买人数',
            self.driver.find_element_by_id('filterCategoryPerson').text)
        self.assertIn(
            '总销量',
            self.driver.find_element_by_id('filterCategoryQuantity').text)
        self.assertIn(
            '总金额',
            self.driver.find_element_by_id('filterCategoryPrice').text)

    @classmethod
    def tearDownClass(cls):
        '''Stop the web driver only once.
        Doing this repeatedly in `tearDown()` could be time-consuming.'''
        cls.driver.quit()


if __name__ == '__main__':
    # Intercept the output from unittest and display it at the end.
    testsuite = unittest.TestLoader().loadTestsFromTestCase(TestBookStore)
    with io.StringIO() as f:
        unittest.TextTestRunner(stream=f).run(testsuite)
        cl.info('Message from unittest:')
        print(f.getvalue(), end='')
Example #16
def demo1():
    cl.section('Demo 1')

    cl.info('Test program started.')

    with cl.progress('Running test case 1...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 1: Passed')

    with cl.progress('Running test case 2...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 2: Passed')

    with cl.progress('Running test case 3...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 3: Passed')

    with cl.progress('Running test case 4...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.error('Test case 4: Failed')

    cl.info('Input: 1111')
    cl.info('Expected output: 2222')
    cl.info('Got: 3333')

    cl.section('Test Result')
    cl.info('3 out of 4 test cases passed.')
    cl.info('Pass rate: 75%')
Example #17
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
    def __exit__(self, type_, value, trace):
        self.t_end = timeit.default_timer()
        time_str = pretty_time(self.t_end - self.t_start)
        cl.info("Procedure '%s' cost time: %s" % (self.name, time_str))
Example #19
def training_task_process(task_id, user_id, tweets_filename, userinfo_filename,
                          tag, params):
    task = get_task_by_id(task_id)
    logfilename = 'tasklog-%d.log' % task_id

    try:
        logfile = open(log_file(logfilename), 'w', encoding='utf-8')
    except Exception:
        traceback.print_exc()
        update_task_status(task, code=STATUS_FAILED,
                           detail='Error: %s' % get_exc_line())
        return

    try:
        sys.stdout = logfile
        sys.stderr = logfile

        cl.config(color_span=0)
        cl.info('Task started, pid is %d' % os.getpid())

        update_task_status(task, code=STATUS_RUNNING, detail='Preprocessing')
        text_preprocessor_twlda(tweets_filename[:-4], tweet_min_length=2,
                                user_min_tweets=1, remove_duplicates=True)

        desc_prefix = '%d-%s' % (user_id, tag)
        num_topics_range = list(range(params['min_topics'],
                                      params['max_topics'] + 1))

        for topics in num_topics_range:
            update_task_status(task, detail='Training: %d topics' % topics)
            twitter_lda(output_desc='%s-%d' % (desc_prefix, topics),
                        topics=topics, iteration=params['iterations'],
                        show_console_output=False)

        update_task_status(task, detail='Analyzing: plotting perplexity')
        plot_filename, _ = plot_diff_topics(num_topics_range, desc_prefix,
                                            r'Perplexity is ([\d.]+)',
                                            pipe_encoding)

        user = User.get_by_id(user_id)

        with db.atomic():
            f = File.create(file_type='plot', owner=user,
                            original_name='ldaplot-%s.png' % tag,
                            physical_name=plot_filename,
                            size=os.stat(report_file(plot_filename)).st_size)
            task.plot = f
            task.save()

        report_ids = []

        for topics in num_topics_range:
            update_task_status(task, detail='Analyzing: generating report for '
                                            '%d topics' % topics)
            report = visualization_twlda(params['keyword'],
                                         '%s-%d' % (desc_prefix, topics),
                                         '%s-%d' % (tag, topics),
                                         userinfo_filename, open_browser=False)

            with db.atomic():
                f = File.create(file_type='report', owner=user,
                                original_name=report, physical_name=report,
                                size=os.stat(report_file(report)).st_size)

            report_ids.append(f.id)

        with db.atomic():
            task.reports = json.dumps(report_ids)
            task.save()

        update_task_status(task, code=STATUS_FINISHED, detail='Finished')
    except Exception:
        traceback.print_exc()
        update_task_status(task, code=STATUS_FAILED,
                           detail='Error: %s' % get_exc_line())
    finally:
        logfile.close()
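training_task_process redirects sys.stdout/sys.stderr to a log file and logs its own pid, which suggests it is meant to run as a separate worker process. A minimal launch sketch, assuming multiprocessing; the task id, user id, filenames, tag and parameter values are illustrative:

from multiprocessing import Process

worker = Process(target=training_task_process,
                 args=(1, 1, 'twdata-java.csv', 'twusers-java.csv',
                       'java-20200101000000',
                       {'keyword': 'java', 'min_topics': 5,
                        'max_topics': 15, 'iterations': 1000}))
worker.start()
worker.join()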
Example #20
def data_retriever(data_source,
                   query,
                   save_filename,
                   *,
                   lang='',
                   proxy=None,
                   remove_duplicates=False,
                   twapi_max=None,
                   twapi_sleep_time=0,
                   twscrape_poolsize=20,
                   twscrape_begindate=None,
                   ghapi_org=None,
                   ghapi_since=None,
                   soapi_begindate=None):
    cl.section('Data Retriever')
    cl.info('Starting to retrieve query: %s, or org: %s' % (query, ghapi_org))
    cl.info('From data source: %s' % data_source)
    cl.info('Using proxy: %s' % proxy)
    cl.info('Remove duplicates: %s' % remove_duplicates)

    if proxy:
        os.environ['HTTP_PROXY'] = proxy
        os.environ['HTTPS_PROXY'] = proxy

    if data_source == 'twitter_standard_api':
        data = twapi_search(query,
                            twapi_max,
                            sleep_time=twapi_sleep_time,
                            lang=lang)
    elif data_source == 'twitterscraper':
        data = twscrape_search(query,
                               lang=lang,
                               poolsize=twscrape_poolsize,
                               begindate=twscrape_begindate)
    elif data_source == 'github_api':
        data = github_issue_org_fetch(ghapi_org, ghapi_since)
    elif data_source == 'stackoverflow_api':
        data = soapi_search(query, begindate=soapi_begindate)
    else:
        cl.error('Data source %r is not implemented' % data_source)
        sys.exit(-1)

    if remove_duplicates:
        data = iterator_aggregate_list(data)
        data_no_duplicate_text = remove_duplicate_text(data)
        cl.info('Exporting data without duplicate text')
        export_csv(data_no_duplicate_text, data_source_file(save_filename))

        save_filename_full = name_with_title_suffix(save_filename, '-full')
        cl.info('Exporting full data')
        export_csv(data, data_source_file(save_filename_full))
    else:
        export_csv(data, data_source_file(save_filename))
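A hedged usage sketch for data_retriever; the data source, query, output filename and limits are illustrative:

data_retriever('twitter_standard_api', '#python', 'twdata-python.csv',
               lang='en', twapi_max=5000, twapi_sleep_time=1,
               remove_duplicates=True)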