Example #1
def download(dataset='uea'):
    """ Downloads the uea data to '/raw/uea'. """
    raw_dir = DATA_DIR + '/raw'
    assert os.path.isdir(raw_dir), "No directory exists at data/raw. Please make one to continue."

    if dataset == 'uea':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UEA'
        zipname = save_dir + '/uea.zip'
    elif dataset == 'ucr':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UCR'
        zipname = save_dir + '/ucr.zip'
    elif dataset == 'tsr':
        url = 'https://zenodo.org/record/3902651/files/Monash_UEA_UCR_Regression_Archive.zip?download=1'
        save_dir = DATA_DIR + '/raw/TSR'
        zipname = save_dir + '/tsr.zip'
    else:
        raise ValueError('Can only download uea, ucr or tsr. Was asked for {}.'.format(dataset))

    if os.path.exists(save_dir):
        print('Path already exists at {}. If you wish to re-download you must delete this folder.'.format(save_dir))
        return

    mkdir_if_not_exists(save_dir)

    if len(os.listdir(save_dir)) == 0:
        download_url(url, zipname)
        unzip(zipname, save_dir)
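
The helpers mkdir_if_not_exists, download_url and unzip are called above but not shown. A minimal sketch of what they could look like using only the standard library (the signatures follow the calls above; the bodies are assumptions, not the project's actual code):

import os
import urllib.request
import zipfile


def mkdir_if_not_exists(path):
    # Create the directory (and any missing parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)


def download_url(url, save_path):
    # Fetch the remote archive and write it to save_path.
    urllib.request.urlretrieve(url, save_path)


def unzip(zip_path, extract_dir):
    # Extract the downloaded archive into the dataset directory.
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(extract_dir)
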
Example #2
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="ptwiki-compressed-text-folder")
    parser.add_argument("-o",
                        "--output",
                        default="./data/cleaned/",
                        help="directory for extracted files")

    args = parser.parse_args()
    input_dirname = args.input
    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)

    vocab, tokens = set(), 0
    output = OutputSplitter(NextFile(output_dirname), 10 * 1024 * 1024, True)

    num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    print('Running with {0} threads ...'.format(num_threads))
    job_batch_size = 1000
    reporter = JobsReporter(report_period=1000)
    documents = read_wiki_documents_compressed(input_dirname)

    jobs = grouper(documents, job_batch_size)

    for job in pool.imap(worker_clean_document, jobs):
        for sentences in job:
            for sentence in sentences:
                output.write((sentence + '\n').encode('utf-8'))
                tokens += sentence.count(' ') + 1
                for w in sentence.split():
                    vocab.add(w)
                reporter.complete_job(report=True)
    output.close()
    print('\n')
    print('Tokens: ', tokens)
    print('Vocabulary: ', len(vocab))
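
grouper and worker_clean_document come from the surrounding project. For grouper, a batching generator along the lines of the classic itertools recipe would match how it is used here; this is an assumption about its behaviour, not the original code:

from itertools import islice


def grouper(iterable, n):
    # Yield successive lists of at most n items from the iterable,
    # so each thread receives one batch of documents per job.
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch
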
def main():
    logging.basicConfig(format='%(levelname)s: %(message)s')
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="deps_context_path")
    parser.add_argument("-o",
                        "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")
    parser.add_argument(
        "-m",
        "--model",
        default="./data/models/word2vec/word2vec-s400-w5-m5.bin",
        help="word2vec model to extract vocab")

    args = parser.parse_args()
    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)
    deps_context_path = args.input

    word_vectors = KeyedVectors.load_word2vec_format(args.model, binary=True)
    vocab = set(word_vectors.vocab)
    logging.info('Vocab:\t%d', len(vocab))

    extract_start = time.perf_counter()

    logging.info("Processing wv ...")
    filter_file(deps_context_path, output_dirname, 'wv', vocab)
    logging.info("Processing cv ...")
    filter_file(deps_context_path, output_dirname, 'cv', vocab)
    logging.info("Processing dep.contexts ...")
    filter_file(deps_context_path, output_dirname, 'dep.contexts', vocab)

    extract_duration = time.perf_counter() - extract_start
    logging.info("elapsed %f", extract_duration)
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="sqlfile_path")
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument(
        "--processes",
        type=int,
        default=default_process_count,
        help="Number of processes to use (default %(default)s)")
    parser.add_argument(
        "-b",
        "--batchsize",
        type=int,
        default=50,
        help="The number of sentences sent to the parser in each iteration.")
    parser.add_argument("-o",
                        "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q",
                        "--quiet",
                        action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug",
                        action="store_true",
                        help="print debug info")

    args = parser.parse_args()
    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)
    sqlfile_path = args.input
    job_batch_size = args.batchsize

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    options = {}
    options['quiet'] = args.quiet
    options['debug'] = args.debug
    options['sqlfile_path'] = sqlfile_path
    options['job_batch_size'] = job_batch_size
    options['output_dirname'] = output_dirname
    createLogger(options['quiet'], options['debug'])

    number_of_pages = get_page_count(sqlfile_path, job_batch_size)
    jobs = [(pageNum, ) for pageNum in range(1, number_of_pages)]

    # process pages
    logging.info("Starting")
    extract_start = time.perf_counter()

    process_count = args.processes
    process_count = max(1, process_count)
    maxsize = 10 * process_count
    # output queue
    output_queue = Queue(maxsize=maxsize)
    worker_count = process_count

    # load balancing
    max_spool_length = 10000
    spool_length = Value('i', 0, lock=False)

    # reduce job that sorts and prints output
    reduce = Process(target=reduce_process,
                     args=(options, output_queue, spool_length))
    reduce.start()

    # initialize jobs queue
    jobs_queue = Queue(maxsize=maxsize)

    # start worker processes
    logging.info("Using %d processes.", worker_count)
    workers = []
    for i in range(worker_count):
        extractor = Process(target=extract_process,
                            args=(options, i, jobs_queue, output_queue))
        extractor.daemon = True  # only live while parent process lives
        extractor.start()
        workers.append(extractor)

    # Mapper process
    page_num = 0
    for page_data in jobs:
        pageNum, = page_data
        # slow down
        delay = 0
        if spool_length.value > max_spool_length:
            # reduce to 10%
            while spool_length.value > max_spool_length / 10:
                time.sleep(10)
                delay += 10
        if delay:
            logging.info('Delay %ds', delay)
        job = (pageNum, )
        jobs_queue.put(job)  # goes to any available extract_process
        page_num += 1

    # signal termination
    for _ in workers:
        jobs_queue.put(None)
    # wait for workers to terminate
    for w in workers:
        w.join()

    # signal end of work to reduce process
    output_queue.put(None)
    # wait for it to finish
    reduce.join()

    extract_duration = time.perf_counter() - extract_start
    extract_rate = (page_num * job_batch_size) / extract_duration
    logging.info(
        "Finished %d-process of %d sentences in %.1fs (%.1f sentences/s)",
        process_count, page_num * job_batch_size, extract_duration,
        extract_rate)
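
extract_process and reduce_process are defined elsewhere in the project; what matters in the main function above is the queue-and-sentinel protocol it relies on (one None per worker on the jobs queue, a final None on the output queue). A self-contained toy version of that pattern, with placeholder work standing in for the real extraction:

from multiprocessing import Process, Queue


def worker(jobs_queue, output_queue):
    # Mirror extract_process: consume jobs until the None sentinel, then exit.
    while True:
        job = jobs_queue.get()
        if job is None:
            break
        output_queue.put(job * job)  # placeholder for the real extraction work


if __name__ == '__main__':
    jobs_queue, output_queue = Queue(), Queue()
    workers = [Process(target=worker, args=(jobs_queue, output_queue))
               for _ in range(2)]
    for w in workers:
        w.start()
    n_jobs = 10
    for i in range(n_jobs):
        jobs_queue.put(i)
    for _ in workers:
        jobs_queue.put(None)  # one sentinel per worker, as in the code above
    results = [output_queue.get() for _ in range(n_jobs)]
    for w in workers:
        w.join()
    print(sorted(results))
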
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="ptwiki-compressed-text-folder")
    parser.add_argument("-o",
                        "--output",
                        default="./models/",
                        help="directory for extracted files")
    parser.add_argument("-s", "--size", type=int, default=200, help="size")
    parser.add_argument("-w", "--window", type=int, default=5, help="window")
    parser.add_argument("-m",
                        "--mincount",
                        type=int,
                        default=2,
                        help="mincount")
    parser.add_argument("-sg",
                        "--sg",
                        type=int,
                        default=0,
                        help="use skip-gram")

    args = parser.parse_args()
    output_dirname = args.output
    input_dir = args.input
    size = args.size
    sg = args.sg
    window = args.window
    min_count = args.mincount

    mkdir_if_not_exists(output_dirname)

    # e.g. '../data/ptwiki-articles-text-preprocessed'
    wiki_text_dump_path = input_dir + '/**/*.bz2'
    sentences = MySentences(wiki_text_dump_path)

    # build vocabulary and train model
    model = gensim.models.Word2Vec(sentences,
                                   size=size,
                                   window=window,
                                   min_count=min_count,
                                   sg=sg,
                                   workers=multiprocessing.cpu_count())

    # model.train(documents, total_examples=len(documents), epochs=10)

    # Trim unneeded model memory: precompute the L2-normalized vectors and,
    # with replace=True, discard the originals to save RAM. After this the
    # model is effectively read-only (most_similar/similarity still work,
    # but it cannot be trained further).
    model.init_sims(replace=True)

    model_file_name = os.path.join(
        output_dirname,
        'word2vec-s{0}-w{1}-m{2}-sg{3}'.format(size, window, min_count, sg))
    # model.save(model_file_name)
    model.wv.save_word2vec_format('{0}.bin'.format(model_file_name),
                                  binary=True)

    word_vectors = model.wv
    # word_vectors.save(output_dirname)

    print(word_vectors.most_similar(positive=['carro'], topn=10))
    print("Most similar to {0}".format(
        word_vectors.most_similar(positive="america")))
    print('CONCLUDED')
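
MySentences is the project's corpus iterator; gensim only needs an object whose __iter__ yields lists of tokens. A sketch that matches the glob pattern built above, assuming one pre-tokenised sentence per line inside the .bz2 files (an assumption, not the original class):

import bz2
import glob


class MySentences:
    # Stream whitespace-tokenised sentences from every .bz2 file matching
    # the given glob pattern (e.g. '<input_dir>/**/*.bz2').

    def __init__(self, path_pattern):
        self.path_pattern = path_pattern

    def __iter__(self):
        for path in glob.glob(self.path_pattern, recursive=True):
            with bz2.open(path, 'rt', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split()
                    if tokens:
                        yield tokens
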
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="sqlfile_path")
    parser.add_argument(
        "-b",
        "--batchsize",
        type=int,
        default=50,
        help="The number of sentences sent to the parser in each iteration.")
    parser.add_argument("-mc",
                        "--mincount",
                        type=int,
                        help="ignores all contexts that apears less then")
    parser.add_argument(
        "-wv",
        "--wordvocabcount",
        type=int,
        help="generate the word vocab and ignore all words that appear fewer "
        "than this many times")
    parser.add_argument("-o",
                        "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")
    args = parser.parse_args()
    sqlfile_path = args.input
    job_batch_size = args.batchsize
    output_dirname = args.output
    wordvocabcount = args.wordvocabcount
    mincount = args.mincount
    mkdir_if_not_exists(output_dirname)
    word_vocab_file = os.path.join(output_dirname, 'wordvocabcount')

    cv_all = Counter()
    wv_all = Counter()
    word_filter = None
    output = None
    if wordvocabcount is None:
        with open(word_vocab_file, 'rb') as f:
            word_filter = wv_all = pickle.load(f)
        output = open(os.path.join(output_dirname, 'dep.contexts'), 'wb')
    else:
        mincount = None
    num_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(num_threads)
    print('Running with {0} threads ...'.format(num_threads))
    print('Batch size: {0}'.format(job_batch_size))

    reporter = JobsReporter(batch_size=job_batch_size, report_period=10)

    with sqlite3.connect(sqlfile_path) as conn:
        c = conn.cursor()
        c.execute('SELECT COUNT(*) FROM sentences where palavras IS NOT NULL')
        (total, ) = c.fetchone()
        jobs_number = math.ceil(total / job_batch_size)
        jobs = range(1, jobs_number)
        print('Sentences to be parsed: {0}'.format(total))

        reporter.reset()
        for (cv, wv, batch_result) in pool.imap(
                create_worker_method(sqlfile_path, job_batch_size,
                                     word_filter), jobs):
            cv_all = cv_all + cv
            if wordvocabcount is not None:
                wv_all = wv_all + wv
            else:
                output.write('\n'.join(batch_result).encode('utf-8'))
            reporter.complete_job(report=True)
    print('\n')

    if wordvocabcount is not None:
        for key, count in dropwhile(
                lambda key_count: key_count[1] >= wordvocabcount,
                wv_all.most_common()):
            del wv_all[key]
        with open(word_vocab_file, 'wb') as f:
            pickle.dump(wv_all, f)

        with open(os.path.join(output_dirname, 'wv'),
                  encoding='utf-8',
                  mode='w') as f:
            for w, count in wv_all.items():
                f.write('{} {}\n'.format(w, count))
        wv_all = None

    if wordvocabcount is None:
        for key, count in dropwhile(lambda key_count: key_count[1] >= mincount,
                                    cv_all.most_common()):
            del cv_all[key]
        with open(os.path.join(output_dirname, 'cv'),
                  encoding='utf-8',
                  mode='w') as f:
            for w, count in cv_all.items():
                f.write('{} {}\n'.format(w, count))
        cv_all = None
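
JobsReporter is used in two of the mains above (reset() once, complete_job(report=True) per finished job). A minimal stand-in with that interface, assuming it just prints throughput every report_period jobs; the real class may do more:

import sys
import time


class JobsReporter:
    # Minimal progress reporter with the interface used above; the printed
    # format and the timing logic are assumptions, not the original class.

    def __init__(self, batch_size=1, report_period=10):
        self.batch_size = batch_size
        self.report_period = report_period
        self.reset()

    def reset(self):
        self.completed = 0
        self.start = time.perf_counter()

    def complete_job(self, report=False):
        self.completed += 1
        if report and self.completed % self.report_period == 0:
            elapsed = time.perf_counter() - self.start
            items = self.completed * self.batch_size
            sys.stdout.write('\r{0} items in {1:.1f}s ({2:.1f} items/s)'.format(
                items, elapsed, items / elapsed))
            sys.stdout.flush()
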