Example #1
print("Avg Solid $", avg_solid_ties)

print("\n>>> SHOWING STAGE 4 EXAMPLES")
"""
Stage 4: Exporting

CSV or TSV
Excel
Code challenge - export data
"""

# Exporting CSV Files
print("4.a Exporting CSV Files")
from my_utils import write_to_file
## Export Hermes Ties
write_to_file("_data/hermes.csv", hermes_ties)

## Export JCrew Ties
write_to_file("_data/jcrew.csv", jcrew_ties)

## numpy version
print("ERROR line 173")
#numpy.savetxt("numpy_woolTies.csv", wool_ties, delimiter=",", fmt="%s")

# further filter and combine with savetxt
solid_silk_ties = filter_col_by_string(data_from_csv, "print", "_solid")
print("ERROR line 178")
#numpy.savetxt("numpy_solidSilkTies.csv", solid_silk_ties, delimiter=",", fmt="%s")

## Writing more functions to export csv files
print("4.b More functions")
Example #2
    foreign_total_gross = table_rows[1].contents[10].text
    if foreign_total_gross == "n/a":
        foreign_total_gross = not_available

    return foreign_opening_weekend, foreign_total_gross, n_countries, sw_opening_gross, sw_opening_date


if __name__ == "__main__":
    '''
    Output file: BOM_foreign_page_data.csv
    Format of the file:
    <film title>
    <foreign page link>
    <foreign opening weekend income>
    <foreign total gross income>
    <number of countries for which data is available>
    <swedish opening weekend gross>
    <swedish opening weekend date>
    '''
    link_list = get_links_from_bom_copypaste()
    #Just generate the links; do NOT verify that the pages exist
    foreign_page_link_list = get_international_links(link_list)
    #Try to access each link generated above and parse the page
    foreign_data_list = parse_international_page(foreign_page_link_list)
    header = [
        "bom_movie_name", "bom_foreign_url", "foreign_opening_weekend",
        "foreign_total_gross", "n_countries", "swedish_opening_weekend_gross",
        "swedish_opening_weekend_date"
    ]
    write_to_file(foreign_data_path, foreign_data_list, header)
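
# A hedged sketch of the write_to_file helper called above, inferred only
# from its call signature (path, rows, header); the real implementation
# lives elsewhere in this project and is not shown in the excerpt.
import csv

def write_to_file(path, rows, header):
    # Write the header row, then one row per film.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)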
Example #3
            artificial_sw_page_link = first_split + to_sw_page + last_split
            #Check if the page contains "Sweden" in the table
            sw_page_link, sw_gross = _parse_page(artificial_sw_page_link)
        logger.info("{} swedish link: {}; swedish gross: {}".format(
            title, sw_page_link, sw_gross))
        new_row = [title, sw_page_link, _clean_gross(sw_gross)]
        sw_link_gross_list.append(new_row)
    logger.info("Done")
    return sw_link_gross_list


def _parse_page(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "lxml")
    result = soup.body.findAll(text="Sweden")
    if len(result) == 0:
        #No Swedish record is available
        return not_available, not_available
    else:
        #Link exists; try to get the Swedish gross
        sw_gross = soup.find('tr', {'bgcolor': '#ffff99'}).contents[4].text
        if len(sw_gross) == 0:
            sw_gross = not_available
        return link, sw_gross


if __name__ == "__main__":
    international_link_list = get_links_from_bom_copypaste()
    sw_link_gross = get_swedish_link(international_link_list)
    write_to_file(sw_gross_path, sw_link_gross, header)
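
# A minimal sketch of the _clean_gross helper referenced above, assuming it
# strips currency formatting ("$1,234,567" -> 1234567) and passes the
# not_available sentinel through unchanged; the real helper is not shown in
# this excerpt.
def _clean_gross(raw_gross):
    if raw_gross == not_available:
        return raw_gross
    return int(raw_gross.replace("$", "").replace(",", ""))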
Example #4
def main():
    logging.set_verbosity(logging.INFO)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    vocab = read_vocab(FLAGS.vocab_file)
    processor = NerProcessor(max_len=FLAGS.max_seq_length, vocab=vocab)

    label_list = processor.get_labels()

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.train_data,
                                                      FLAGS.data_augment,
                                                      is_mask=FLAGS.is_mask)

        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
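        # Worked example of the step arithmetic above (illustrative numbers,
        # not from the source): 10,000 examples, batch size 32, 3 epochs and
        # warmup_proportion 0.1 give
        #   num_train_steps  = int(10000 / 32 * 3) = 937
        #   num_warmup_steps = int(937 * 0.1)      = 93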
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")

        convert_examples_features(train_examples, train_file,
                                  FLAGS.max_seq_length)
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_examples))
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.eval_data)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        convert_examples_features(eval_examples, eval_file,
                                  FLAGS.max_seq_length)

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len(eval_examples))
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # Note: evaluation reuses estimator.predict and scores the written
        # results with valid() below rather than calling estimator.evaluate.
        result = estimator.predict(input_fn=eval_input_fn)

        logging.info("===================predicting=========================")
        preds = list(result)
        sents, lengths, _ = get_sent_length_label(eval_examples)
        write_to_file(preds=preds,
                      lengths=lengths,
                      sent2ids=sents,
                      file=FLAGS.eval_result_file,
                      vocab=vocab)
        valid(result_file=FLAGS.eval_result_file, label_file=FLAGS.eval_data)

    if FLAGS.do_predict:

        predict_examples = processor.get_test_examples(FLAGS.test_data)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")

        convert_examples_features(predict_examples, predict_file,
                                  FLAGS.max_seq_length)
        logging.info("***** Running prediction*****")
        logging.info("  Num examples = %d", len(predict_examples))
        logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        logging.info("===================predicting=========================")
        preds = list(result)
        sents, lengths, _ = get_sent_length_label(predict_examples)

        write_to_file(preds=preds,
                      lengths=lengths,
                      sent2ids=sents,
                      file=FLAGS.result_file,
                      vocab=vocab)
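
# Hedged usage sketch -- the flag names below are exactly those referenced
# via FLAGS in main(); the script name (run_ner.py) and file paths are
# placeholders, not taken from the source.
# python run_ner.py \
#   --bert_config_file=bert_config.json \
#   --vocab_file=vocab.txt \
#   --output_dir=./out \
#   --do_train=true \
#   --train_data=train.txt \
#   --max_seq_length=128 \
#   --train_batch_size=32 \
#   --num_train_epochs=3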
Example #5
        for i in table.find_all('td'):
            #Look for MPAA rating
            if re.match(mpaa_regex, str(i.contents[0])):
                mpaa_rating = i.contents[1].text
            #Look for production budget
            if re.match(production_budget_regex, str(i.contents[0])):
                production_budget = i.contents[1].text
            #Look for domestic (US) total gross
            if re.search(domestic_total_regex, str(i.contents[0])):
                raw_text = str(i.contents[0].text)
                us_total_gross = raw_text.split(': ')[1]
        logger.info(
            "{} has MPAA: {}; production cost: {}; domestic total: {}, opening weekend: {}"
            .format(title, mpaa_rating, production_budget, us_total_gross,
                    opening_weekend))
        new_row = [
            title, international_link,
            _clean_mpaa(mpaa_rating),
            _clean_prod_budget(production_budget),
            _clean_gross(us_total_gross),
            _clean_gross(opening_weekend)
        ]
        mpaa_production_list.append(new_row)
    return mpaa_production_list


if __name__ == "__main__":
    link_list = get_links_from_bom_copypaste()
    static_data = get_us_values(link_list)
    write_to_file(static_data_path, static_data, header)
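
# A minimal sketch of the _clean_prod_budget helper used above, assuming BOM
# budgets read like "$200 million" and that the not_available sentinel passes
# through unchanged; the real helper is not shown in this excerpt.
def _clean_prod_budget(raw_budget):
    if raw_budget == not_available:
        return raw_budget
    amount = raw_budget.replace("$", "").replace(" million", "")
    return int(float(amount) * 1_000_000)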