def start_analysis():
    """Read the input data, run the analysis, and package the results.

    The reader is chosen by ``FLAGS.input_form`` ("binary", "text" or
    "json"). Sequences whose language is rejected by
    ``languages.should_keep`` are dropped, the remainder are serialized,
    split into segments and passed to ``do_analysis`` (through a process
    pool when ``FLAGS.parallelism`` is greater than 1).

    Side effect: when the data set carries a ``logged_method_name``, the
    matching logged method is appended to the module-level ``PFE_METHODS``.

    Returns:
        A ``result_pb2.AnalysisResultProto`` whose ``results`` field holds
        every entry produced by ``to_protos``.

    Raises:
        ValueError: If ``FLAGS.input_form`` is not one of the supported
            values.
    """
    input_data_path = FLAGS.input_data
    input_form = FLAGS.input_form

    LOG.info("Reading input data ...")
    if input_form == "binary":
        data_set = read_binary_input(input_data_path)
    elif input_form == "text":
        data_set = read_text_input(input_data_path)
    elif input_form == "json":
        data_set = read_json_input(input_data_path)
    else:
        LOG.error(
            "Unknown input_form. Needs to be 'binary', 'text', or 'json'.")
        # Fail here with a clear message; otherwise `data_set` would be
        # unbound and the next statement would raise a confusing
        # UnboundLocalError instead.
        raise ValueError(
            "Unknown input_form %r. Needs to be 'binary', 'text', or 'json'."
            % (input_form,))
    LOG.info('Read %s sequences', len(data_set.sequences))

    if data_set.logged_method_name:
        PFE_METHODS.append(
            logged_pfe_method.for_name(data_set.logged_method_name))

    LOG.info("Preparing input data.")
    # The sequence protos need to be serialized because they are sent to
    # another process.
    sequences = [
        sequence.SerializeToString()
        for sequence in data_set.sequences
        if languages.should_keep(sequence.language)
    ]
    segmented_sequences, segment_size = segment_sequences(
        sequences, FLAGS.parallelism * 2)

    LOG.info("Running simulations on %s sequences.", len(sequences))
    if FLAGS.parallelism > 1:
        with Pool(FLAGS.parallelism) as pool:
            merged = merge_results(
                pool.map(do_analysis, segmented_sequences), segment_size)
    else:
        merged = merge_results(
            [do_analysis(segment) for segment in segmented_sequences],
            segment_size)

    if merged.failed_indices:
        LOG.info("%s sequences dropped due to errors in simulation.",
                 len(merged.failed_indices))
        if FLAGS.failed_indices_out:
            write_failed_indices(merged.failed_indices)

    LOG.info("Formatting output.")
    method_protos = to_protos(merged.totals_by_method, cost.cost)
    results_proto = result_pb2.AnalysisResultProto()
    results_proto.results.extend(method_protos)
    return results_proto
def main(argv):
    """Filter the binary input data set and write it to standard output.

    Reads the data set named by ``FLAGS.input_data``, keeps only the
    sequences for which ``languages.should_keep`` accepts the language and
    ``sample()`` returns a truthy value, then writes the serialized data
    set to ``sys.stdout`` as raw bytes.

    Args:
        argv: Command-line arguments; ignored.
    """
    del argv  # Unused.

    data_set = read_binary_input(FLAGS.input_data)

    # `sample()` is only consulted for sequences that pass the language
    # check, because `and` short-circuits.
    retained = [
        entry
        for entry in data_set.sequences
        if languages.should_keep(entry.language) and sample()
    ]

    # Swap the field's contents in place for the retained sequences.
    del data_set.sequences[:]
    data_set.sequences.extend(retained)

    serialized = data_set.SerializeToString()
    sys.stdout.buffer.write(serialized)
def test_should_keep_default(self):
    """`should_keep` accepts the language string "hello".

    NOTE(review): presumably runs with no filter flags set; the fixture
    setup is not visible here, so confirm against the enclosing test class.
    """
    kept = languages.should_keep("hello")
    self.assertTrue(kept)
def test_script_category_takes_priority(self):
    """"en" is rejected while "ja" and "zh" are kept.

    NOTE(review): the flag configuration that produces this outcome is set
    outside this method; confirm against the enclosing test class.
    """
    self.assertFalse(languages.should_keep("en"))
    for language in ("ja", "zh"):
        self.assertTrue(languages.should_keep(language))
def test_with_invalid_script_category(self):
    """"en", "ja" and "zh" are all kept.

    NOTE(review): the invalid script category is presumably configured
    outside this method; confirm against the enclosing test class.
    """
    for language in ("en", "ja", "zh"):
        self.assertTrue(languages.should_keep(language))
def test_with_language_filter(self):
    """"en" and "ja" are kept while "zh" is rejected.

    NOTE(review): the language filter is presumably configured outside
    this method; confirm against the enclosing test class.
    """
    for language in ("en", "ja"):
        self.assertTrue(languages.should_keep(language))
    self.assertFalse(languages.should_keep("zh"))