def prepare_dirs(argv):
    """Reset the working directories and derive the output file names.

    ``argv[0]`` is the input directory and ``argv[1]`` the output directory.
    The local scratch directory ``/tmp/idx/`` is removed if it exists and
    created again, the output directory is cleared through
    ``util.delete_out_dir``, and the input file names are listed through
    ``util.get_file_names``.

    Returns:
        A tuple ``(in_dir, out_dir, in_file_names, out_file_names,
        local_dir)``.  Each output name is the matching input name with
        every occurrence of ``'.txt'`` replaced by ``'.idx'``.
    """
    in_dir, out_dir = argv[0], argv[1]
    local_dir = '/tmp/idx/'

    # Always start from an empty scratch directory.
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    os.mkdir(local_dir)

    util.delete_out_dir(out_dir)

    in_file_names = util.get_file_names(in_dir)
    out_file_names = [name.replace('.txt', '.idx') for name in in_file_names]
    return in_dir, out_dir, in_file_names, out_file_names, local_dir
Beispiel #2
0
def main():
    """Print one JSON line per corpus document that has a usable title.

    Builds the per-day corpus directories for the configured year and
    months, then for every file returned by ``get_file_names``:

    * extracts the title (files whose extraction raises, or whose title is
      empty, are skipped),
    * normalizes it with ``normalize_title``,
    * keeps it only if ``is_monocase`` rejects its tokens and
      ``guessLanguage`` reports ``"en"``,
    * and, when the document body is non-empty, prints
      ``[file name, UTF-8 encoded title]`` as a JSON array on its own line.

    Progress is logged every 100 files as ``"<collected> / <seen>"``.
    """
    year = 2014
    # Earlier runs used other month subsets, written as zero-padded strings
    # ('01' .. '07', '02', '03', ...).
    months = range(11, 13)
    days = xrange(1, 32)

    # Zero-pad the day: a plain width of 2 pads with a space and produces
    # paths such as ".../2014/11/ 1/".
    # NOTE(review): assumes day directories are zero-padded like the month
    # strings used in earlier runs -- confirm against the corpus layout.
    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:02d}/'.format(year, month, day)
        for month in months
        for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except Exception:
            # Catch Exception rather than everything, so that Ctrl-C and
            # SystemExit still stop the run instead of being logged as a
            # missing title.
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # Keep only titles that are not monocase and are English.
        if is_monocase(nltk.word_tokenize(title)):
            continue
        if guessLanguage(title) != "en":
            continue

        body = get_document_content_paf(fname)
        if body.strip():  # non-empty body
            collected += 1
            # Single-argument parenthesized form prints the same text under
            # the Python 2 print statement.
            print(json.dumps([fname, unicode(title).encode("utf8")]))
def main():
    """Print one JSON line per corpus document that has a usable title.

    Builds the per-day corpus directories for the configured year and
    months, then for every file returned by ``get_file_names``:

    * extracts the title (files whose extraction raises, or whose title is
      empty, are skipped),
    * normalizes it with ``normalize_title``,
    * keeps it only if ``is_monocase`` rejects its tokens and
      ``guessLanguage`` reports ``"en"``,
    * and, when the document body is non-empty, prints
      ``[file name, UTF-8 encoded title]`` as a JSON array on its own line.

    Progress is logged every 100 files as ``"<collected> / <seen>"``.
    """
    year = 2014
    # Earlier runs used other month subsets, written as zero-padded strings
    # ('01' .. '07', '02', '03', ...).
    months = range(11, 13)
    days = xrange(1, 32)

    # Zero-pad the day: a plain width of 2 pads with a space and produces
    # paths such as ".../2014/11/ 1/".
    # NOTE(review): assumes day directories are zero-padded like the month
    # strings used in earlier runs -- confirm against the corpus layout.
    paths = [
        '/cs/puls/Corpus/Business/Puls/{}/{}/{:02d}/'.format(year, month, day)
        for month in months
        for day in days
    ]

    collected = 0
    for i, fname in enumerate(get_file_names(paths)):
        if i % 100 == 0:
            logger.info("{} / {}".format(collected, i))

        try:
            title = extract_title(fname)
        except Exception:
            # Catch Exception rather than everything, so that Ctrl-C and
            # SystemExit still stop the run instead of being logged as a
            # missing title.
            logger.debug('Fail to find title')
            continue

        if not title:  # no title
            continue

        title = normalize_title(title)

        # Keep only titles that are not monocase and are English.
        if is_monocase(nltk.word_tokenize(title)):
            continue
        if guessLanguage(title) != "en":
            continue

        body = get_document_content_paf(fname)
        if body.strip():  # non-empty body
            collected += 1
            # Single-argument parenthesized form prints the same text under
            # the Python 2 print statement.
            print(json.dumps([fname, unicode(title).encode("utf8")]))
Beispiel #4
0
def main(unused_argv):
    """Train the road estimator, then save predictions for test and train data.

    Steps, in order:

    1. Load training images, training labels and test data through ``util``
       (all with ``tiling=False``).
    2. Round the labels and cast them to ``int32``; reflect-pad images and
       labels by 104 on each side of axes 1 and 2; add a trailing axis to
       the labels.
    3. Train a ``tf.estimator.Estimator`` built from ``cnn_model_fn`` with
       checkpoints under ``outputs/road``.
    4. Predict on the test data and write one image per test sample into
       ``predictions_test/`` using the names from ``util.get_file_names()``.
    5. Predict on the (padded) training data and write
       ``predictions_train/satImage_NNN.png`` for each training sample.

    Args:
        unused_argv: Ignored.
    """
    images = util.load_train_img(tiling=False)
    masks = util.load_train_lbl(tiling=False)
    test_images = util.load_test_data(tiling=False)

    masks = np.around(masks).astype('int32')

    # Expand to 608 x 608 by reflecting 104 pixels on every side.
    # NOTE(review): presumably images are (N, H, W, C) and labels (N, H, W),
    # since only axes 1 and 2 are padded -- confirm against the loaders.
    border = (104, 104)
    untouched = (0, 0)
    images = np.pad(images, (untouched, border, border, untouched), 'reflect')
    masks = np.pad(masks, (untouched, border, border), 'reflect')

    # No axis roll is applied: the data keeps the layout it was loaded with.

    # The image augmentation needs an explicit channel axis on the labels.
    masks = np.expand_dims(masks, 3)

    # Create the estimator.
    road_estimator = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="outputs/road")

    # Train the model; the input function repeats indefinitely
    # (num_epochs=None), so max_steps is what ends training.
    fit_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": images},
        y=masks,
        batch_size=constants.BATCH_SIZE,
        num_epochs=None,
        shuffle=True)
    total_steps = (constants.N_SAMPLES * constants.NUM_EPOCH) // constants.BATCH_SIZE
    road_estimator.train(input_fn=fit_input_fn, max_steps=total_steps)

    def predict_classes(data):
        """Run one unshuffled pass over *data* and collect each 'classes' output."""
        single_pass_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": data},
            num_epochs=1,
            shuffle=False,
            batch_size=constants.BATCH_SIZE)
        outputs = road_estimator.predict(input_fn=single_pass_fn)
        return [out['classes'] for out in outputs]

    def to_image(labels):
        """Turn one label map into an image ready to be saved."""
        pixels = util.label_to_img_full(IMG_SIZE, IMG_SIZE, labels)
        pixels = util.img_float_to_uint8(pixels)
        return Image.fromarray(pixels)

    # Predictions on the test data.
    util.create_prediction_dir("predictions_test/")
    file_names = util.get_file_names()
    test_classes = predict_classes(test_images)
    for idx in range(constants.N_TEST_SAMPLES):
        to_image(test_classes[idx]).save('predictions_test/' + file_names[idx])

    # Predictions on the training data.
    util.create_prediction_dir("predictions_train/")
    train_classes = predict_classes(images)
    for idx in range(constants.N_SAMPLES):
        to_image(train_classes[idx]).save(
            'predictions_train/satImage_{:03}.png'.format(idx + 1))