def main(data_folder_path):
    scispacy_parser = scispacy_util.SciSpaCyParser()
    train_path = os.path.join(data_folder_path, "train")
    dev_path = os.path.join(data_folder_path, "dev")
    test_path = os.path.join(data_folder_path, "test")

    # parse train set
    conll_parser = ConllParser(train_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse dev set
    conll_parser = ConllParser(dev_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()

    # parse test set
    conll_parser = ConllParser(test_path, scispacy_parser)
    conll_parser.parse_text_files_to_conll_format()
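
# A minimal command-line entry point sketch, assuming the script is run with the
# path to a folder containing train/, dev/, and test/ subfolders. The argparse
# wiring below is illustrative and not necessarily how the original script was invoked.
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Convert train, dev, and test publication text to CoNLL format")
    arg_parser.add_argument("data_folder_path",
                            help="folder containing the train/, dev/, and test/ subfolders")
    args = arg_parser.parse_args()
    main(args.data_folder_path)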
def __init__(self, train_path, dev_path, kb_path, test_path=None):
    # path to the data folder for the train set
    self.train_path = train_path

    # path to the data folder for the dev set
    self.dev_path = dev_path

    # path to the json kb file
    self.kb_path = kb_path

    # optional path to the data folder for the test set.
    # If this argument is passed in, the model will use mentions from the dev
    # and train sets to make predictions on the test set. Otherwise it will use
    # mentions from the train set to make predictions on the test set.
    self.test_path = test_path

    # set of unique mentions in the dev set
    self._dev_set_mentions = set()
    self._build_dev_set_mentions()

    # set of unique mentions in the train set
    self._train_set_mentions = set()
    self._build_train_set_mentions()

    # set of unique mentions in the entire kb
    self._all_mentions = set()
    self._build_all_mentions()

    # dictionary mapping dataset id to a set of mentions of that dataset
    self._id_to_mentions = {}
    self._build_id_to_mentions()

    # set of english stopwords
    self._stopwords = set(stopwords.words('english'))

    # an instance of a scispacy parser
    self._scispacy_parser = scispacy_util.SciSpaCyParser()

    # dictionary mapping mention to the number of datasets it is a mention for
    self._mention_dataset_count = {}

    # the total number of datasets
    self._dataset_count = 0
    self._build_mention_dataset_count()

    # precompile mention regexes
    self._dataset_id_to_regexes = {}
    for dataset_id in self._id_to_mentions:
        compiled_res = []
        for mention in self._id_to_mentions[dataset_id]:
            mention_patterns = self._build_mention_patterns(mention)
            for pattern in mention_patterns:
                compiled_re = re.compile(pattern)
                compiled_res.append(compiled_re)
        self._dataset_id_to_regexes[dataset_id] = compiled_res
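
# Usage sketch (illustrative only): the precompiled patterns in
# self._dataset_id_to_regexes can be scanned over a publication's text to collect
# candidate dataset ids. The variable names below are hypothetical.
#
#   candidate_ids = set()
#   for dataset_id, regexes in self._dataset_id_to_regexes.items():
#       if any(compiled_re.search(publication_text) for compiled_re in regexes):
#           candidate_ids.add(dataset_id)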
def __init__(self, train_path, dev_path, sage_methods_path, leipzig_word_counts_path):
    # path to the data folder for the train set
    self.train_path = train_path

    # path to the data folder for the dev set
    self.dev_path = dev_path

    # read the list of sage methods and prepare a regex to match them.
    with open(sage_methods_path, mode='rt') as sage_methods_file:
        sage_method_entries = json.load(sage_methods_file)["@graph"]
    method_names = []
    for entry in sage_method_entries:
        if "skos:prefLabel" in entry:
            method_names.append(entry["skos:prefLabel"]["@value"])
        if "skos:altLabel" in entry:
            if isinstance(entry["skos:altLabel"], list):
                for label in entry["skos:altLabel"]:
                    method_names.append(label["@value"])
            else:
                method_names.append(entry["skos:altLabel"]["@value"])

    # lowercase and remove duplicates.
    method_names = list({name.lower() for name in method_names})

    # escape regex metacharacters in the method names and build a single
    # case-insensitive, word-boundary-delimited pattern.
    method_regexes = [re.escape(method_name) for method_name in method_names]
    methods_regex_string = r'\b(?P<method_name>' + '|'.join(method_regexes) + r')\b'
    # to debug the regex: print(methods_regex_string)
    self.sage_methods_regex = re.compile(methods_regex_string, re.IGNORECASE)

    # set of english stopwords
    self._stopwords = set(stopwords.words('english'))

    # an instance of a scispacy parser
    self._scispacy_parser = scispacy_util.SciSpaCyParser()

    # read word counts in the Leipzig corpus.
    self._read_leipzig_word_counts_file(leipzig_word_counts_path)
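
# Usage sketch (illustrative only): once __init__ has run, the compiled pattern can
# be scanned over text, and each hit exposes the named group defined above. The
# example sentence is made up.
#
#   text = "We fit a hierarchical linear regression to the survey data."
#   for match in self.sage_methods_regex.finditer(text):
#       print(match.group("method_name"), match.start(), match.end())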
def generate_citations_from_ner_mentions(
        ner_mentions: List[Dict[str, Union[int, str, float]]],
        kb_path: str):
    """Generate candidate citations for the mentions produced by the ner model
       by using TFIDF weighted overlap with dataset titles

       @param ner_mentions: list of the ner mentions
       @param kb_path: path to the knowledge base of datasets
    """
    nltk_stopwords = set(stopwords.words('english'))
    scispacy_parser = scispacy_util.SciSpaCyParser()
    substring_matches = set()
    tfidf_vectorizer = text_utils.get_tfidf_vectorizer()

    with open(kb_path) as kb_file_:
        kb = json.load(kb_file_)

    dataset_titles = []
    tokenized_dataset_titles = []
    dataset_ids = []
    dataset_id_to_title = {}
    for dataset in tqdm(kb, desc="processing kb"):
        dataset_title = text_utils.text_preprocess(dataset["title"])
        dataset_id = dataset["data_set_id"]
        dataset_titles.append(dataset_title)
        tokenized_dataset_titles.append(dataset_title.split(" "))
        dataset_ids.append(dataset_id)
        dataset_id_to_title[dataset_id] = dataset_title.split(" ")

    output_citations = []
    num_candidates = []
    i = 0
    mention_citations = []
    for mention in tqdm(ner_mentions, desc="Generating candidates from ner mentions"):
        publication_id = mention["publication_id"]
        mention_text = mention["mention"]
        instance = mention["instance"]

        # skip mentions whose context adds fewer than five tokens beyond the mention itself
        if len(instance) - len(mention_text.split()) < 5:
            continue

        # skip single-word mentions that are not all uppercase
        if len(mention_text.split()) == 1 and not mention_text.isupper():
            continue

        parsed_sentence = scispacy_parser.scispacy_create_doc(' '.join(instance))
        pos_counts = defaultdict(int)
        for t in parsed_sentence:
            pos_counts[t.pos_] += 1

        # skip contexts with no nouns or verbs
        if pos_counts["NOUN"] + pos_counts["VERB"] == 0:
            continue

        # skip contexts that are mostly numbers, symbols, and punctuation and contain no verbs
        if (pos_counts["NUM"] + pos_counts["SYM"] + pos_counts["PUNCT"]) > 0.4 * len(parsed_sentence) \
                and pos_counts["VERB"] == 0:
            continue

        mention_citations.append({"publication_id": publication_id,
                                  "mention": mention_text,
                                  "score": mention["score"]})

        mention_text = text_utils.text_preprocess(mention_text)
        dataset_candidates = text_utils.get_substring_candidates(dataset_ids,
                                                                 dataset_titles,
                                                                 tokenized_dataset_titles,
                                                                 mention_text,
                                                                 instance,
                                                                 nltk_stopwords,
                                                                 scispacy_parser,
                                                                 tfidf_vectorizer)
        num_candidates.append(0)

        sorted_candidates = []
        for dataset_id, match_count in zip(dataset_candidates[0], dataset_candidates[1]):
            sorted_candidates.append((dataset_id, match_count))
        sorted_candidates = sorted(sorted_candidates, key=lambda x: x[1], reverse=True)

        filtered_candidates = []
        for candidate in sorted_candidates:
            score = candidate[1]
            if score > 0.0:
                filtered_candidates.append((candidate[0], score))

        # keep at most the top 30 candidates with a positive score
        for top_candidate in range(0, min(30, len(filtered_candidates))):
            if sorted_candidates != []:
                num_candidates[i] += 1
                output_dict = {}
                output_dict["publication_id"] = publication_id
                output_dict["data_set_id"] = sorted_candidates[top_candidate][0]
                output_dict["score"] = sorted_candidates[top_candidate][1]
                output_dict["mention_list"] = [mention["mention"]]
                output_citations.append(output_dict)
        i += 1

    print("Num mentions:", len(num_candidates))
    print("Average candidates per mention:", np.mean(num_candidates))
    print("Min, median, max candidates per mention:",
          np.min(num_candidates),
          np.median(num_candidates),
          np.max(num_candidates))
    print("unique:", sum(np.unique(num_candidates, return_counts=True)[1]))
    return output_citations, mention_citations
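
# Illustrative helper showing the shape of ner mention this function expects; the
# field values are made up, and the kb path only mirrors the one used elsewhere in
# this project. Not part of the original pipeline.
def _example_generate_citations_from_ner_mentions():
    example_mentions = [{"publication_id": 123,
                         "mention": "NLSY97",
                         "instance": "We use data from the NLSY97 cohort .".split(),
                         "score": 0.92}]
    return generate_citations_from_ner_mentions(example_mentions,
                                                os.path.join("project", "data", "data_sets.json"))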
import os
import logging
import re

from s2base import scispacy_util
from tqdm import tqdm

from create_sgtb_dataset import get_scispacy_doc

# nltk stopwords may need to be downloaded once:
#   import nltk
#   nltk.download('stopwords')

logging.basicConfig(level=logging.ERROR)

# the path to the test publications.json
PUB_PATH = os.path.abspath(os.path.join("data", "test", "publications.json"))

# the path to the test text files
TEXT_FILES_PATH = os.path.abspath(os.path.join("data", "test"))

# an instance of SciSpaCyParser
SCISPACY_PARSER = scispacy_util.SciSpaCyParser()


def create_conll_line(token):
    """Create one line of the output conll file

       @param token: the token for the line being created
    """
    word = token.text
    pos = token.pos_
    tag = "O"
    linking_tag = "_"
    entity_tag = "O"

    output_line = word + " " + pos + " " + tag + " " + entity_tag
    return output_line
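
# A sketch of how create_conll_line can be driven over a publication's full text
# using the module-level SCISPACY_PARSER. The helper name and the
# blank-line-between-sentences layout are assumptions, not part of the original code.
def create_conll_lines_for_text(full_text):
    """Yield one conll line per token, with a blank line between sentences."""
    doc = SCISPACY_PARSER.scispacy_create_doc(full_text)
    for sentence in doc.sents:
        for token in sentence:
            yield create_conll_line(token)
        yield ""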
def create_dataset_input(rule_based_candidates,
                         mention_context_cache_path,
                         data_folder_path,
                         overall_output_path=None,
                         is_test=False,
                         output_path=None,
                         overwrite_dataset=False):
    """Function to take in the rule based candidates and create the input format
       for the SGTB model. This function is intended to be used for processing test data,
       as the main function in this file will convert and save train, dev, and test output.

       @param rule_based_candidates: a dict mapping publication id to a list of candidates
                                     from the rule based model
       @param mention_context_cache_path: path to a dictionary mapping <pub_id>:<mention_text>
                                          pairs to all contexts
       @param data_folder_path: path to the data folder
       @param overall_output_path: path to the overall output file (optional, used for SGTB training)
       @param is_test: parameter indicating whether or not the data being processed is test data
       @param output_path: the path to write the output to (if not processing test data)
       @param overwrite_dataset: whether or not to overwrite the existing dataset
                                 (will be true for train and false for dev and test)
    """
    scispacy_parser = scispacy_util.SciSpaCyParser()
    prior_entity_probs = compute_entity_probabilities()
    prior_entity_given_mention_probs = compute_entity_given_mention_probs()
    prior_mention_given_entity_probs = compute_mention_given_entity_probs()

    glove_path = os.path.abspath(os.path.join("project", "data", "glove", "glove.6B.50d.txt"))
    with open(glove_path, "r") as lines:
        glove = {line.split()[0]: np.array([float(value) for value in line.split()[1:]])
                 for line in lines}

    # I haven't run the experiments to tell if having a cache actually helps or not;
    # it takes a while to load the cache when it is used.
    # if is_test:
    #     mention_context_cache = {}
    # else:
    #     try:
    #         print("Loading cache...")
    #         mention_context_cache = joblib.load(mention_context_cache_path)["cache"]
    #         print("Cache loaded...")
    #     except:
    #         mention_context_cache = {}
    mention_context_cache = {}

    kb_path = os.path.abspath(os.path.join("project", "data", "data_sets.json"))
    with open(kb_path) as kb_file:
        kb_json = json.load(kb_file)

    dataset_id_to_kb_entry = {}
    for dataset in kb_json:
        dataset_id_to_kb_entry[dataset["data_set_id"]] = dataset

    matcher = Matcher(scispacy_parser.nlp.vocab)
    section_matcher = Matcher(scispacy_parser.nlp.vocab)
    for section_name in SECTION_STRINGS:
        section_matcher.add(section_name,
                            None,
                            [{"LOWER": section_name}, {"ORTH": "\n"}],
                            [{"LOWER": section_name}, {"ORTH": ":"}],
                            [{"ORTH": "\n"}, {"LOWER": section_name}, {"ORTH": "."}])

    output_docs = []
    pub_ids = []
    # we will write a new file on the first document, and append to it afterwards
    first_doc = True
    cache_changed = False
    for pub_id in tqdm(rule_based_candidates, desc='create dataset in create_sgtb_dataset.py'):
        spacy_doc = get_scispacy_doc(data_folder_path, pub_id, scispacy_parser)

        pub_ids.append(pub_id)
        doc_candidates = rule_based_candidates[pub_id]
        output_doc = []

        dataset_id_to_longest_mention_text = {}
        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            if dataset_id in dataset_id_to_longest_mention_text:
                if len(mention_text) > len(dataset_id_to_longest_mention_text[dataset_id]):
                    dataset_id_to_longest_mention_text[dataset_id] = mention_text
            else:
                dataset_id_to_longest_mention_text[dataset_id] = mention_text

        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            # if mention_text != dataset_id_to_longest_mention_text[dataset_id]:
            #     continue
            mention_context_cache_key = str(pub_id) + "_" + mention_text
            if mention_context_cache_key in mention_context_cache:
                mention_contexts = mention_context_cache[mention_context_cache_key]
            else:
                # search for the mention text in the doc
                spacy_mention_text = scispacy_parser.scispacy_create_doc(mention_text)

                pattern = []
                for token in spacy_mention_text:
                    pattern.append({"ORTH": token.text})
                try:
                    matcher.add("MENTION", None, pattern)
                    matches = list(matcher(spacy_doc))
                except ValueError:
                    continue

                # build and save a mapping of <pub_id>_<mention_text> to all contexts the mention
                # is found in
                cache_changed = True
                mention_contexts = []
                token_idx_to_sent_idx = {}
                sentences_list = list(spacy_doc.sents)
                context_size = 3
                for sent_idx, sent in enumerate(sentences_list):
                    for token in sent:
                        token_idx = token.i
                        token_idx_to_sent_idx[token_idx] = sent_idx

                for match_id, start, end in matches:
                    sentence_idx = token_idx_to_sent_idx[start]
                    start_context_sent_idx = max(0, sentence_idx - context_size)
                    if start_context_sent_idx == 0:
                        match_sentence_idx = sentence_idx
                    else:
                        match_sentence_idx = context_size
                    end_context_sent_idx = min(len(sentences_list), sentence_idx + context_size)
                    mention_context = sentences_list[start_context_sent_idx:end_context_sent_idx + 1]
                    sentences_as_docs = []
                    for sentence in mention_context:
                        sentences_as_docs.append(sentence.as_doc())

                    start_context_token_idx = sentences_list[start_context_sent_idx].start
                    end_context_token_idx = sentences_list[end_context_sent_idx - 1].end
                    context_with_offsets = (sentences_as_docs,
                                            (start_context_token_idx, end_context_token_idx),
                                            (start, end),
                                            match_sentence_idx)
                    mention_contexts.append(context_with_offsets)

                # limit featurizing to first 3 contexts in order of appearance
                mention_contexts = mention_contexts[:3]
                mention_context_cache[mention_context_cache_key] = mention_contexts
                matcher.remove("MENTION")

            if mention_contexts != []:
                output_mention = create_output_mention(is_test,
                                                       row,
                                                       prior_entity_probs,
                                                       prior_entity_given_mention_probs,
                                                       mention_text,
                                                       prior_mention_given_entity_probs,
                                                       dataset_id_to_kb_entry,
                                                       mention_contexts,
                                                       scispacy_parser,
                                                       glove,
                                                       spacy_doc,
                                                       section_matcher)
                output_doc.append(output_mention)

        # only write output to file if not processing test data
        if not is_test:
            if first_doc:
                with open(output_path, "w") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                first_doc = False
                if overwrite_dataset:
                    with open(overall_output_path, "w") as overall_output_file:
                        json.dump(output_doc, overall_output_file)
                        overall_output_file.write("\n")
            else:
                with open(output_path, "a") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                with open(overall_output_path, "a") as overall_output_file:
                    json.dump(output_doc, overall_output_file)
                    overall_output_file.write("\n")

        output_docs.append(json.loads(json.dumps(output_doc)))

    # if cache_changed and not is_test:
    #     joblib.dump({"cache": mention_context_cache}, mention_context_cache_path)
    return output_docs, pub_ids
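
# Example invocation sketch for test-time use. The candidate structure mirrors how
# the function reads each row (a dict keyed by publication id, where every row
# carries "mention" and "candidate_dataset_ids"); the ids and paths below are
# placeholders for illustration only.
#
#   candidates = {"12345": [{"mention": "National Education Longitudinal Study",
#                            "candidate_dataset_ids": [42]}]}
#   docs, pub_ids = create_dataset_input(candidates,
#                                        mention_context_cache_path="mention_context_cache.pkl",
#                                        data_folder_path=os.path.join("data", "test"),
#                                        is_test=True)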