def get_doc_line(self, doc, line):
    """Return the text field of one line of a stored document.

    The document is fetched with ``self.db.get_doc_lines(doc)`` and is
    treated as newline-separated rows whose fields are tab-separated; the
    text returned is always the second field (index 1) of a row.

    Parameters
    ----------
    doc:
        Identifier handed to ``self.db.get_doc_lines``.
    line:
        Row index to read when greater than -1.  Any other value selects a
        row among those whose second field is non-blank, using the index
        returned by ``SimpleRandom.get_instance().next_rand(0, count - 1)``.

    Returns
    -------
    The second tab-separated field of the selected row.
    """
    rows = self.db.get_doc_lines(doc).split("\n")

    # Explicit index requested: read that row directly.
    if line > -1:
        requested_row = rows[line]
        return requested_row.split("\t")[1]

    # Otherwise gather the text of every row that has a non-blank second
    # field and let SimpleRandom choose among them.
    candidates = []
    for row in rows:
        fields = row.split("\t")
        if len(fields) > 1 and fields[1].strip():
            candidates.append(fields[1])

    chosen = SimpleRandom.get_instance().next_rand(0, len(candidates) - 1)
    return candidates[chosen]
def get_doc_line(self, doc, line):
    """Return the text field of one line of a stored document.

    The document is fetched with ``self.doc_db.get_doc_lines(doc)`` and is
    treated as newline-separated rows whose fields are tab-separated; the
    text returned is the second field (index 1) of a row.

    When the ``PERMISSIVE_EVIDENCE`` environment variable is set to one of
    ``y``, ``yes``, ``true``, ``t`` or ``1`` (case-insensitive) and the
    lookup yields ``None``, an empty string is returned instead.

    Parameters
    ----------
    doc:
        Identifier handed to ``self.doc_db.get_doc_lines``.
    line:
        Row index to read when greater than -1.  Any other value selects a
        row among those whose second field is non-blank, using the index
        returned by ``SimpleRandom.get_instance().next_rand(0, count - 1)``.
    """
    doc_text = self.doc_db.get_doc_lines(doc)

    # The switch is read on every call, exactly as the lookup result is.
    permissive_flag = os.getenv("PERMISSIVE_EVIDENCE", "n").lower()
    if permissive_flag in ("y", "yes", "true", "t", "1") and doc_text is None:
        return ""

    rows = doc_text.split("\n")

    # Explicit index requested: read that row directly.
    if line > -1:
        return rows[line].split("\t")[1]

    # Otherwise gather every non-blank text field and let SimpleRandom pick.
    candidates = []
    for row in rows:
        fields = row.split("\t")
        if len(fields) > 1 and fields[1].strip():
            candidates.append(fields[1])

    chosen = SimpleRandom.get_instance().next_rand(0, len(candidates) - 1)
    return candidates[chosen]
def evidence_num_to_text(db: Union[Dict, FeverDocDB],
                         page_id: str,
                         line: int,
                         is_snopes: bool = False):
    """Resolve an evidence reference (page id + line number) to its text.

    Parameters
    ----------
    db:
        A dictionary when ``is_snopes`` is set, otherwise a ``FeverDocDB``;
        the pairing is enforced with assertions.
    page_id:
        Page identifier passed to ``db.get_doc_lines`` (or forwarded to
        ``evidence_num_to_text_snopes`` for Snopes data).
    line:
        Row index to read when greater than -1.  Any other value selects a
        row among those whose second tab-separated field is non-blank,
        using the index returned by
        ``SimpleRandom.get_instance().next_rand(0, count - 1)``.
    is_snopes:
        When true, the whole lookup is delegated to
        ``evidence_num_to_text_snopes``.

    Returns
    -------
    The second tab-separated field of the selected row, or ``""`` when the
    database returns ``None`` for the page.
    """
    # Each data source must be paired with its matching container type.
    if is_snopes:
        assert isinstance(db, Dict), "db should be dictionary for Snopes data"
    else:
        assert isinstance(
            db, FeverDocDB), "db should be fever DB for fever data"

    logger = LogHelper.get_logger("evidence_num_to_text")

    if is_snopes:
        return evidence_num_to_text_snopes(db, page_id, line)

    doc_text = db.get_doc_lines(page_id)
    if doc_text is None:
        return ""

    rows = doc_text.split("\n")
    if line > -1:
        return rows[line].split("\t")[1]

    # Gather every non-blank text field and let SimpleRandom pick one.
    candidates = []
    for row in rows:
        fields = row.split("\t")
        if len(fields) > 1 and fields[1].strip():
            candidates.append(fields[1])

    chosen = SimpleRandom.get_instance().next_rand(0, len(candidates) - 1)
    return candidates[chosen]
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]], cuda_device: int, serialization_dir: str, filtering: str) -> Model:
    """
    Train a model from a parameter specification and archive the result.

    The function sets up logging inside ``serialization_dir``, saves a copy
    of the parameters, reads the training (and optional validation) data
    with a ``FEVERReader``, builds and saves a vocabulary, constructs the
    model, iterator and trainer from ``params``, runs ``trainer.train()``
    and finally calls ``archive_model(serialization_dir)``.

    Side effects visible in this function: ``sys.stdout`` and ``sys.stderr``
    are replaced with ``TeeLogger`` objects and are not restored here, and a
    ``logging.FileHandler`` is added to the root logger and not removed.

    Parameters
    ----------
    db: FeverDocDB, required.
        Passed as the first argument to ``FEVERReader``.
    params: Params, required.
        The experiment specification.  The keys consumed here are
        ``dataset_reader``, ``train_data_path``, ``validation_data_path``
        (optional), ``vocabulary``, ``model``, ``iterator`` and ``trainer``.
        NOTE(review): the annotation also allows a plain ``dict``, but
        ``as_dict(quiet=True)`` is called on a copy of it - confirm that
        callers always pass an object providing ``as_dict``.
    cuda_device: int, required.
        When not ``None``, stored under ``"cuda_device"`` in the trainer
        parameters before the trainer is built.
    serialization_dir: str, required
        The directory in which to save results and logs.  It is created if
        it does not exist.
    filtering: str, required.
        Forwarded to ``FEVERReader`` as its ``filtering`` argument.

    Returns
    -------
    The model built by ``Model.from_params`` after ``trainer.train()`` has
    run.
    """
    # presumably seeds the random number generators - confirm in SimpleRandom
    SimpleRandom.set_seeds()
    os.makedirs(serialization_dir, exist_ok=True)
    # Replace the standard streams with TeeLogger objects built from a log
    # file path inside serialization_dir and the current stream.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    # Attach an INFO-level file handler to the root logger.
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    # Snapshot the parameters from a deep copy, before the pops below
    # consume entries from ``params`` itself.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)
    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)
    train_data_path = params.pop('train_data_path')
    # ``logger`` is not defined in this function; presumably a module-level
    # logger - verify it exists in the enclosing module.
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)
    # Track which datasets feed the vocabulary and their names for logging.
    all_datasets = [train_data]
    datasets_in_vocab = ["train"]
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None
    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    # The vocabulary is built over the instances of every dataset read above.
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    trainer_params = params.pop("trainer")
    # The explicit argument overrides any device set in the specification.
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)
    trainer.train()
    # Now tar up results
    archive_model(serialization_dir)
    return model
from rte.riedel.sent_features import SentenceLevelTermFrequencyFeatureFunction


def model_exists(mname):
    """Return whether the file ``models/<mname>.model`` exists."""
    model_file = "{0}.model".format(mname)
    return os.path.exists(os.path.join("models", model_file))


def str2bool(v):
    """Convert a command-line string to a boolean.

    Accepts ``yes/true/t/y/1`` as true and ``no/false/f/n/0`` as false,
    case-insensitively; anything else raises
    ``argparse.ArgumentTypeError``.
    """
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


if __name__ == "__main__":
    SimpleRandom.set_seeds()
    LogHelper.setup()
    logger = LogHelper.get_logger(__name__)

    parser = argparse.ArgumentParser()

    # Required positional inputs, registered in command-line order.
    positional_args = (
        ('db', 'db file path'),
        ('train', 'train file path'),
        ('dev', 'dev file path'),
    )
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, type=str, help=arg_help)

    # Optional switches.
    parser.add_argument('--test', type=str, default=None, required=False,
                        help="test file path")
    parser.add_argument("--model", type=str, help="model name")
    parser.add_argument("--sentence", default=False, type=str2bool)
    parser.add_argument("--filtering", default=None, type=str)

    args = parser.parse_args()
from retrieval.fever_doc_db import FeverDocDB from retrieval.filter_uninformative import uninformative parser = argparse.ArgumentParser() parser.add_argument('db_path', type=str, help='/path/to/fever.db') args = parser.parse_args() jlr = JSONLineReader() docdb = FeverDocDB(args.db_path) idx = docdb.get_non_empty_doc_ids() idx = list(filter(lambda item: not uninformative(item), tqdm(idx))) r = SimpleRandom.get_instance() with open("data/fever/test.ns.rand.jsonl", "w+") as f: for line in jlr.read("data/fever-data/test.jsonl"): if line["label"] == "NOT ENOUGH INFO": for evidence_group in line['evidence']: for evidence in evidence_group: evidence[2] = idx[r.next_rand(0, len(idx))] evidence[3] = -1 f.write(json.dumps(line) + "\n") with open("data/fever/dev.ns.rand.jsonl", "w+") as f: for line in jlr.read("data/fever-data/dev.jsonl"): if line["label"] == "NOT ENOUGH INFO":