def text_preprocessor(input_filename,
                      *,
                      preprocessor_cls='TextPreprocessor',
                      custom_stop_words=None,
                      lem_ignore_patterns=None,
                      remove_duplicates=False):
    cl.section('Text Preprocessor')

    input_filename = data_source_file(input_filename)
    preprocessor_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        result = preprocess_csv(input_filename,
                                preprocessor_cls=preprocessor_cls,
                                custom_stop_words=custom_stop_words,
                                lem_ignore_patterns=lem_ignore_patterns)

        if remove_duplicates:
            result = remove_duplicate_text(result)

        result = tuple(result)
        cl.info('Effective data size: %d' % len(result))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(result, input_filename)
# The enclosing decorator is reconstructed here; the original snippet showed
# only the inner wrapper, and the decorator name 'time_measure' is assumed.
def time_measure(func):
    def wrapper(*args, **kwargs):
        t_start = timeit.default_timer()
        ret = func(*args, **kwargs)
        t_end = timeit.default_timer()
        time_str = pretty_time(t_end - t_start)
        cl.info("Function '%s' cost time: %s" % (func.__name__, time_str))
        return ret
    return wrapper
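A minimal usage sketch for the timing decorator reconstructed above; the decorator name time_measure and the decorated function are illustrative, not part of the original listing:

@time_measure
def run_pipeline():
    # Any long-running procedure; its runtime is logged via cl.info()
    # when the call returns.
    time.sleep(2)

run_pipeline()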
Example #3
def twapi_search(query, count, *, sleep_time=0, **kwargs):
    with open(DEBUG_FILENAME, 'w'):
        pass

    last_id = None
    num_fetched = 0

    while count > 0:
        next_count = min(count, MAX_COUNT_PER_REQ)
        next_id = last_id - 1 if last_id is not None else None
        data = list(
            twapi_search_page(query,
                              next_count,
                              next_id,
                              sleep_time=sleep_time,
                              **kwargs))

        if not data:
            cl.warning('No more data can be retrieved, terminating...')
            return

        for item in data:
            last_id = int(item['id'])
            yield item

        num_fetched += len(data)
        count -= len(data)

        cl.info('Current number of records fetched: %d' % num_fetched)
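A hedged sketch of consuming the generator above; the query, count and language are illustrative. Each yielded item is a raw tweet dict, and its 'id' field drives the paging (next_id = last_id - 1, presumably passed on as max_id by twapi_search_page):

tweets = list(twapi_search('#python', 500, sleep_time=1, lang='en'))
cl.info('Total tweets collected: %d' % len(tweets))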
Example #4
def retweets_recover(csvfilename):
    cl.section('Retweets Recover')
    cl.info('Recovering file: %s' % csvfilename)

    csvfilename = data_source_file(csvfilename)
    result = recover_from_csv(csvfilename)
    exportfilename = name_with_title_suffix(csvfilename, '-recovered')
    export_csv(result, exportfilename)
    return os.path.basename(exportfilename)
Example #5
def overview():
    cl.section('Overview of Labels')
    cl.success('Good job! All test cases passed!')
    cl.warning('Warning! Security update delayed!')
    cl.error('Error! Failed to write file!')
    cl.info('Server listening on port 8888.')
    cl.progress('Downloading package, please wait...')
    cl.plain('Nothing interesting.')
    cl.question('A new version is present, would you like to update? (Y/N)')
def model_analyzer(modeldesc, sourcedesc, *, num_top_words=30,
                   num_top_docs=30, debug=False):
    cl.section('LDA Model Analyzer')
    cl.info('Model description: %s' % modeldesc)
    cl.info('Source description: %s' % sourcedesc)

    with TimeMeasure('load_all'):
        ldamodel, corpus, prep_items, source_texts = load_all(modeldesc,
                                                              sourcedesc)

    with TimeMeasure('analyzing'):
        prep_ids = tuple(item[0] for item in prep_items)
        dictionary = ldamodel.id2word
        num_topics = ldamodel.num_topics
        topics = [{
                    'topic_id': i,
                    'words': get_topic_words(ldamodel, i, num_top_words),
                    'popularity': 0.0,
                    'documents': collections.defaultdict(float)
                } for i in range(num_topics)]

        if debug:
            debugfilename = model_file('ldadoctopics-%s.txt' % modeldesc)
            with open(debugfilename, 'w', encoding='utf-8') as debugfile:
                for index, doc in enumerate(corpus):
                    text_id = prep_ids[index]
                    doc_topics = ldamodel.get_document_topics(doc)
                    text = source_texts[text_id].strip()
                    debugfile.write('%s -> %r, %s\n' % (text_id, doc_topics,
                                                        text))

        term_topics_cache = {}

        for word in dictionary:
            term_topics_cache[word] = ldamodel.get_term_topics(word)

        for index, doc in enumerate(corpus):
            for topic_id, prob in ldamodel.get_document_topics(doc):
                topics[topic_id]['popularity'] += prob

            for word, freq in doc:
                if word not in dictionary:
                    continue

                for topic_id, prob in term_topics_cache[word]:
                    topics[topic_id]['documents'][index] += prob * freq

        for topic in topics:
            topic['documents'] = get_topic_top_docs(topic['documents'],
                                                    num_top_docs,
                                                    prep_ids, source_texts)

        topics = sorted(topics, key=lambda x: x['popularity'], reverse=True)

    with TimeMeasure('export_markdown'):
        export_markdown(modeldesc, sourcedesc, topics)
Example #7
def win():
    cl.success('Congratulations! You solved all the challenges!')
    cl.info(
        "Now here is a gift for you. You can choose a callback function to call "
        "(e.g. try 'rainbow_fart')! Hope you can find the final flag through this!"
    )
    callback = get_callback()
    cl.progress(f'Executing callback {callback!r} for you...')
    exec(f'{callback}()')  # pylint: disable=exec-used # nosec
    bye()
def twlda_multiple_run(num_topics_range,
                       iteration,
                       desc_prefix,
                       show_console_output=True):
    cl.section('Twitter-LDA Multiple Run')

    for topics in num_topics_range:
        cl.info('Running with %d topics' % topics)
        twitter_lda(output_desc='%s-%d' % (desc_prefix, topics),
                    topics=topics,
                    iteration=iteration,
                    show_console_output=show_console_output)
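A hedged usage sketch; the topic range, iteration count and description prefix below are illustrative:

# Train Twitter-LDA once for each topic count from 5 to 15.
twlda_multiple_run(range(5, 16), 1000, 'java', show_console_output=False)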
Example #9
def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)

    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))

    random.shuffle(data)
    data = data[:amount]

    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)
Example #10
def show_level_stats(level_status, items_per_row=5):
    cl.newline()
    cl.section('Level stats:')
    field_width = len(str(NUM_LEVELS))
    rows = math.ceil(NUM_LEVELS / items_per_row)
    for row in range(rows):
        cl.info(' '.join(
            show_level_block(x + 1, field_width, level_status[x])
            for x in range(row * items_per_row,
                           min((row + 1) * items_per_row, NUM_LEVELS))))
    cl.progress(f'Your progress: {sum(level_status)}/{NUM_LEVELS}')
    check_milestones(level_status)
    cl.newline()
Example #11
def main():
    cl.section('Welcome to Python Challenges')
    cl.info(f'Python version: {PYTHON_VERSION}')
    level_status = [False] * NUM_LEVELS
    while True:
        show_level_stats(level_status)

        cl.info(f'Enter a level number (1-{NUM_LEVELS}) to solve a level, '
                'or enter 0 to view source code')
        level_number = get_level_number()

        if level_number == 0:
            print(SOURCE, end='')
            continue

        if level_status[level_number - 1]:
            cl.success('You already solved this level')
            continue

        level_func = globals()[f'level_{level_number}']
        answer = get_input(f'Your answer for level {level_number}: ')

        timer = threading.Timer(CHALLENGE_TIMEOUT, die, args=('Timeout!', ))
        timer.start()

        try:
            global_check(answer)
            answer = ast.literal_eval(answer.strip())
        except Exception:  # pylint: disable=broad-except
            timer.cancel()
            cl.error('Wrong answer')
            if DEBUG_MODE:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            level_func(answer)
        except Exception:  # pylint: disable=broad-except
            timer.cancel()
            cl.error('Wrong answer')
            if DEBUG_MODE:
                traceback.print_exc(file=sys.stdout)
            continue

        timer.cancel()
        cl.success('Correct answer')
        level_status[level_number - 1] = True
Example #12
def twitter_lda(*, output_desc, topics, iteration, alpha_g=None,
                beta_word=0.01, beta_b=0.01, gamma=20,
                show_console_output=True):
    cl.section('Twitter-LDA Runner')
    cl.info('Output description: %s' % output_desc)

    assert re.fullmatch(r'[-_0-9a-zA-Z]+', output_desc)

    if alpha_g is None:
        alpha_g = 50 / topics

    set_parameters(topics, alpha_g, beta_word, beta_b, gamma, iteration)

    with TimeMeasure('Twitter-LDA training'):
        run_twlda(show_console_output=show_console_output)

    move_result(output_desc)
Example #13
def measure_coherence(model, texts, corpus, dictionary):
    cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary,
                        coherence='u_mass')
    u_mass = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_v')
    c_v = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_uci')
    c_uci = cm.get_coherence()

    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                        coherence='c_npmi')
    c_npmi = cm.get_coherence()

    cl.info('Topic coherence: u_mass = %f, c_v = %f, c_uci = %f, c_npmi = %f'
            % (u_mass, c_v, c_uci, c_npmi))
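measure_coherence builds on gensim's CoherenceModel; a minimal sketch of the assumed import and a call, mirroring how the training code elsewhere in this listing invokes it:

from gensim.models import CoherenceModel

measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)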
def main():
    tag = time.strftime('java-%Y%m%d%H%M%S')
    tweets_file_recovered = 'twdata-java-recovered.csv'
    userinfo_file = 'twusers-java.csv'
    num_topics_range = list(range(MIN_TOPICS, MAX_TOPICS + 1))

    # Preprocess
    retry_until_success(text_preprocessor_twlda,
                        tweets_file_recovered[:-4],
                        tweet_min_length=2,
                        user_min_tweets=1,
                        remove_duplicates=True)

    # Train (with different number of topics)
    for topics in num_topics_range:
        cl.info('Running with %d topics' % topics)
        retry_until_success(twitter_lda,
                            output_desc='java-%d' % topics,
                            topics=topics,
                            iteration=ITERATIONS,
                            show_console_output=True)

    # Analyze (Perplexity Plot + HTML Reports + Compress)
    report_files = []
    plot_file, minima_points = plot_diff_topics(num_topics_range, 'java',
                                                r'Perplexity is ([\d.]+)',
                                                pipe_encoding)
    report_files.append(plot_file)
    report_points = minima_points if REPORT_ONLY_MINIMA else num_topics_range

    for topics in report_points:
        report_files.append(
            visualization_twlda(KEYWORD,
                                'java-%d' % topics,
                                '%s-%d' % (tag, topics),
                                userinfo_file,
                                open_browser=False))
    compress_report_files(tag, report_files)
        # Test filtering by category. Check whether result is in a correct format.
        Select(self.driver.find_element_by_id(
            'filterCategory')).select_by_index(1)
        self.driver.find_element_by_id('btnFilterCategory').click()
        self.wait.until(
            EC.visibility_of_element_located((By.ID, 'filterCategoryResult')))
        self.assertIn(
            '购买人数',
            self.driver.find_element_by_id('filterCategoryPerson').text)
        self.assertIn(
            '总销量',
            self.driver.find_element_by_id('filterCategoryQuantity').text)
        self.assertIn(
            '总金额',
            self.driver.find_element_by_id('filterCategoryPrice').text)

    @classmethod
    def tearDownClass(cls):
        '''Stop the web driver only once.
        Doing this repeatedly in `tearDown()` could be time-consuming.'''
        cls.driver.quit()


if __name__ == '__main__':
    # Intercept the output from unittest and display it at the end.
    testsuite = unittest.TestLoader().loadTestsFromTestCase(TestBookStore)
    with io.StringIO() as f:
        unittest.TextTestRunner(stream=f).run(testsuite)
        cl.info('Message from unittest:')
        print(f.getvalue(), end='')
Example #16
def demo1():
    cl.section('Demo 1')

    cl.info('Test program started.')

    with cl.progress('Running test case 1...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 1: Passed')

    with cl.progress('Running test case 2...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 2: Passed')

    with cl.progress('Running test case 3...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.success('Test case 3: Passed')

    with cl.progress('Running test case 4...', cl.PROGRESS_SPIN, erase=True):
        time.sleep(3)
    cl.error('Test case 4: Failed')

    cl.info('Input: 1111')
    cl.info('Expected output: 2222')
    cl.info('Got: 3333')

    cl.section('Test Result')
    cl.info('3 out of 4 test cases passed.')
    cl.info('Pass rate: 75%')
Example #17
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
    def __exit__(self, type_, value, trace):
        self.t_end = timeit.default_timer()
        time_str = pretty_time(self.t_end - self.t_start)
        cl.info("Procedure '%s' cost time: %s" % (self.name, time_str))
Example #19
def training_task_process(task_id, user_id, tweets_filename, userinfo_filename,
                          tag, params):
    task = get_task_by_id(task_id)
    logfilename = 'tasklog-%d.log' % task_id

    try:
        logfile = open(log_file(logfilename), 'w', encoding='utf-8')
    except Exception:
        traceback.print_exc()
        update_task_status(task, code=STATUS_FAILED,
                           detail='Error: %s' % get_exc_line())
        return

    try:
        sys.stdout = logfile
        sys.stderr = logfile

        cl.config(color_span=0)
        cl.info('Task started, pid is %d' % os.getpid())

        update_task_status(task, code=STATUS_RUNNING, detail='Preprocessing')
        text_preprocessor_twlda(tweets_filename[:-4], tweet_min_length=2,
                                user_min_tweets=1, remove_duplicates=True)

        desc_prefix = '%d-%s' % (user_id, tag)
        num_topics_range = list(range(params['min_topics'],
                                      params['max_topics'] + 1))

        for topics in num_topics_range:
            update_task_status(task, detail='Training: %d topics' % topics)
            twitter_lda(output_desc='%s-%d' % (desc_prefix, topics),
                        topics=topics, iteration=params['iterations'],
                        show_console_output=False)

        update_task_status(task, detail='Analyzing: plotting perplexity')
        plot_filename, _ = plot_diff_topics(num_topics_range, desc_prefix,
                                            r'Perplexity is ([\d.]+)',
                                            pipe_encoding)

        user = User.get_by_id(user_id)

        with db.atomic():
            f = File.create(file_type='plot', owner=user,
                            original_name='ldaplot-%s.png' % tag,
                            physical_name=plot_filename,
                            size=os.stat(report_file(plot_filename)).st_size)
            task.plot = f
            task.save()

        report_ids = []

        for topics in num_topics_range:
            update_task_status(task, detail='Analyzing: generating report for '
                                            '%d topics' % topics)
            report = visualization_twlda(params['keyword'],
                                         '%s-%d' % (desc_prefix, topics),
                                         '%s-%d' % (tag, topics),
                                         userinfo_filename, open_browser=False)

            with db.atomic():
                f = File.create(file_type='report', owner=user,
                                original_name=report, physical_name=report,
                                size=os.stat(report_file(report)).st_size)

            report_ids.append(f.id)

        with db.atomic():
            task.reports = json.dumps(report_ids)
            task.save()

        update_task_status(task, code=STATUS_FINISHED, detail='Finished')
    except Exception:
        traceback.print_exc()
        update_task_status(task, code=STATUS_FAILED,
                           detail='Error: %s' % get_exc_line())
    finally:
        logfile.close()
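training_task_process redirects sys.stdout/sys.stderr to a log file and logs its own pid, which suggests it is meant to run as a separate worker process. A minimal launch sketch, assuming multiprocessing; the task id, user id, filenames, tag and parameter values are illustrative:

from multiprocessing import Process

worker = Process(target=training_task_process,
                 args=(1, 1, 'twdata-java.csv', 'twusers-java.csv',
                       'java-20200101000000',
                       {'keyword': 'java', 'min_topics': 5,
                        'max_topics': 15, 'iterations': 1000}))
worker.start()
worker.join()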
Example #20
def data_retriever(data_source,
                   query,
                   save_filename,
                   *,
                   lang='',
                   proxy=None,
                   remove_duplicates=False,
                   twapi_max=None,
                   twapi_sleep_time=0,
                   twscrape_poolsize=20,
                   twscrape_begindate=None,
                   ghapi_org=None,
                   ghapi_since=None,
                   soapi_begindate=None):
    cl.section('Data Retriever')
    cl.info('Starting to retrieve query: %s, or org: %s' % (query, ghapi_org))
    cl.info('From data source: %s' % data_source)
    cl.info('Using proxy: %s' % proxy)
    cl.info('Remove duplicates: %s' % remove_duplicates)

    if proxy:
        os.environ['HTTP_PROXY'] = proxy
        os.environ['HTTPS_PROXY'] = proxy

    if data_source == 'twitter_standard_api':
        data = twapi_search(query,
                            twapi_max,
                            sleep_time=twapi_sleep_time,
                            lang=lang)
    elif data_source == 'twitterscraper':
        data = twscrape_search(query,
                               lang=lang,
                               poolsize=twscrape_poolsize,
                               begindate=twscrape_begindate)
    elif data_source == 'github_api':
        data = github_issue_org_fetch(ghapi_org, ghapi_since)
    elif data_source == 'stackoverflow_api':
        data = soapi_search(query, begindate=soapi_begindate)
    else:
        cl.error('Data source %r is not implemented' % data_source)
        sys.exit(-1)

    if remove_duplicates:
        data = iterator_aggregate_list(data)
        data_no_duplicate_text = remove_duplicate_text(data)
        cl.info('Exporting data without duplicate text')
        export_csv(data_no_duplicate_text, data_source_file(save_filename))

        save_filename_full = name_with_title_suffix(save_filename, '-full')
        cl.info('Exporting full data')
        export_csv(data, data_source_file(save_filename_full))
    else:
        export_csv(data, data_source_file(save_filename))
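A hedged usage sketch for data_retriever; the data source, query, output filename and limits are illustrative:

data_retriever('twitter_standard_api', '#python', 'twdata-python.csv',
               lang='en', twapi_max=5000, twapi_sleep_time=1,
               remove_duplicates=True)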