print("Avg Solid $", avg_solid_ties) print("\n>>> SHOWING STAGE 4 EXAMPLES") """ Stage 4: Exporting CSV or TSV Excel Code challenge - export data """ # Exporting CSV Files print("4.a Exporting CSV Files") from my_utils import write_to_file ## Export Hermes Ties write_to_file("_data/hermes.csv", hermes_ties) ## Export JCrew Ties write_to_file("_data/jcrew.csv", jcrew_ties) ## numpy version print("ERROR line 173") #numpy.savetxt("numpy_woolTies.csv", wool_ties, delimiter=",", fmt="%s") # further filter and combine with savetxt solid_silk_ties = filter_col_by_string(data_from_csv, "print", "_solid") print("ERROR line 178") #numpy.savetxt("numpy_solidSilkTies.csv", solid_silk_ties, delimiter=",", fmt="%s") ## Writing more functions to export csv files print("4.b More functions")
    foreign_total_gross = table_rows[1].contents[10].text
    if foreign_total_gross == "n/a":
        foreign_total_gross = not_available
    return (foreign_opening_weekend, foreign_total_gross, n_countries,
            sw_opening_gross, sw_opening_date)


if __name__ == "__main__":
    '''
    Output file: BOM_foreign_page_data.csv
    Format of the file:
    <film title> <foreign page link> <foreign opening weekend income>
    <foreign total gross income> <number of countries for which data is available>
    <swedish opening weekend gross> <swedish opening weekend date>
    '''
    link_list = get_links_from_bom_copypaste()
    # Just generate the links; do NOT verify that the pages exist
    foreign_page_link_list = get_international_links(link_list)
    # Try to access each link generated above and parse the page
    foreign_data_list = parse_international_page(foreign_page_link_list)
    header = [
        "bom_movie_name", "bom_foreign_url", "foreign_opening_weekend",
        "foreign_total_gross", "n_countries", "swedish_opening_weekend_gross",
        "swedish_opening_weekend_date"
    ]
    write_to_file(foreign_data_path, foreign_data_list, header)
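# A hedged sketch of what get_international_links might do. Assumption: the
# real implementation is not shown here. Old Box Office Mojo movie URLs took a
# page=intl query parameter, so this hypothetical version splices it into each
# main-page link without checking that the resulting page actually exists.
def _get_international_links_sketch(link_list):
    international_links = []
    for link in link_list:
        # e.g. ".../movies/?id=film.htm" -> ".../movies/?page=intl&id=film.htm"
        international_links.append(link.replace("?id=", "?page=intl&id="))
    return international_links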
        artificial_sw_page_link = first_split + to_sw_page + last_split
        # Check whether the page contains "Sweden" in the table
        sw_page_link, sw_gross = _parse_page(artificial_sw_page_link)
        logger.info("{} swedish link: {}; swedish gross: {}".format(
            title, sw_page_link, sw_gross))
        new_row = [title, sw_page_link, _clean_gross(sw_gross)]
        sw_link_gross_list.append(new_row)
    logger.info("Done")
    return sw_link_gross_list


def _parse_page(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "lxml")
    result = soup.body.findAll(text="Sweden")
    if len(result) == 0:
        # No Swedish record is available
        return not_available, not_available
    else:
        # The link exists; try to get the Swedish gross
        sw_gross = soup.find('tr', {'bgcolor': '#ffff99'}).contents[4].text
        if len(sw_gross) == 0:
            sw_gross = not_available
        return link, sw_gross


if __name__ == "__main__":
    international_link_list = get_links_from_bom_copypaste()
    sw_link_gross = get_swedish_link(international_link_list)
    write_to_file(sw_gross_path, sw_link_gross, header)
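# _clean_gross is called above but not defined in this snippet. A minimal
# sketch, assuming the raw values look like "$1,234,567": strip the currency
# symbol and thousands separators, and pass the n/a placeholder through.
def _clean_gross_sketch(raw_gross):
    if raw_gross == not_available:
        return raw_gross
    return raw_gross.replace("$", "").replace(",", "").strip()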
def main():
    logging.set_verbosity(logging.INFO)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    vocab = read_vocab(FLAGS.vocab_file)
    processor = NerProcessor(max_len=FLAGS.max_seq_length, vocab=vocab)
    label_list = processor.get_labels()

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(
            FLAGS.train_data, FLAGS.data_augment, is_mask=FLAGS.is_mask)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        convert_examples_features(train_examples, train_file, FLAGS.max_seq_length)
        logging.info("***** Running training *****")
        logging.info(" Num examples = %d", len(train_examples))
        logging.info(" Batch size = %d", FLAGS.train_batch_size)
        logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.eval_data)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        convert_examples_features(eval_examples, eval_file, FLAGS.max_seq_length)
        logging.info("***** Running evaluation *****")
        logging.info(" Num examples = %d", len(eval_examples))
        logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=eval_input_fn)
        logging.info("===================predicting=========================")
        preds = [row for row in result]
        sents, lengths, _ = get_sent_length_label(eval_examples)
        write_to_file(preds=preds, lengths=lengths, sent2ids=sents,
                      file=FLAGS.eval_result_file, vocab=vocab)
        valid(result_file=FLAGS.eval_result_file, label_file=FLAGS.eval_data)

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.test_data)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        convert_examples_features(predict_examples, predict_file, FLAGS.max_seq_length)
        logging.info("***** Running prediction *****")
        logging.info(" Num examples = %d", len(predict_examples))
        logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=predict_input_fn)
        logging.info("===================predicting=========================")
        preds = [row for row in result]
        sents, lengths, _ = get_sent_length_label(predict_examples)
        write_to_file(preds=preds, lengths=lengths, sent2ids=sents,
                      file=FLAGS.result_file, vocab=vocab)
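# Example invocation (a sketch: the script name and all paths below are
# hypothetical; only the flag names match the FLAGS referenced in main() above):
#
#   python run_ner.py \
#       --do_train=true \
#       --do_eval=true \
#       --bert_config_file=checkpoint/bert_config.json \
#       --vocab_file=checkpoint/vocab.txt \
#       --init_checkpoint=checkpoint/bert_model.ckpt \
#       --train_data=data/train.txt \
#       --eval_data=data/dev.txt \
#       --max_seq_length=128 \
#       --train_batch_size=32 \
#       --output_dir=output/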
print("\n>>> SHOWING STAGE 4 EXAMPLES") """ Stage 4: Exporting CSV or TSV Excel Code challenge - export data """ # Exporting CSV Files print("4.a Exporting CSV Files") from my_utils import write_to_file ## Export Hermes Ties write_to_file("_data/hermes.csv", hermes_ties) ## Export JCrew Ties write_to_file("_data/jcrew.csv", jcrew_ties) ## numpy version print("ERROR line 173") #numpy.savetxt("numpy_woolTies.csv", wool_ties, delimiter=",", fmt="%s") # further filter and combine with savetxt solid_silk_ties = filter_col_by_string(data_from_csv, "print", "_solid") print("ERROR line 178") #numpy.savetxt("numpy_solidSilkTies.csv", solid_silk_ties, delimiter=",", fmt="%s") ## Writing more functions to export csv files print("4.b More functions")
    for i in table.find_all('td'):
        # Look for MPAA rating
        if re.match(mpaa_regex, str(i.contents[0])):
            mpaa_rating = i.contents[1].text
        # Look for production budget
        if re.match(production_budget_regex, str(i.contents[0])):
            production_budget = i.contents[1].text
        # Look for domestic (US) total gross
        if re.search(domestic_total_regex, str(i.contents[0])):
            raw_text = str(i.contents[0].text)
            us_total_gross = raw_text.split(': ')[1]
    logger.info(
        "{} has MPAA: {}; production cost: {}; domestic total: {}, opening weekend: {}"
        .format(title, mpaa_rating, production_budget, us_total_gross,
                opening_weekend))
    new_row = [
        title, international_link, _clean_mpaa(mpaa_rating),
        _clean_prod_budget(production_budget), _clean_gross(us_total_gross),
        _clean_gross(opening_weekend)
    ]
    mpaa_production_list.append(new_row)
    return mpaa_production_list


if __name__ == "__main__":
    link_list = get_links_from_bom_copypaste()
    static_data = get_us_values(link_list)
    write_to_file(static_data_path, static_data, header)
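# _clean_mpaa and _clean_prod_budget are used above but not defined in this
# snippet. Minimal sketches, under the assumption that MPAA values look like
# "PG-13 " and budgets like "$200 million" (hypothetical formats; adjust to
# the real scraped data).
def _clean_mpaa_sketch(raw_rating):
    return raw_rating.strip()

def _clean_prod_budget_sketch(raw_budget):
    if raw_budget == not_available:
        return raw_budget
    # "$200 million" -> 200000000
    amount = raw_budget.replace("$", "").replace("million", "").strip()
    return int(float(amount) * 1000000)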