def tsv_dump(self):
    # Dump this mention as one TSV line; "\N" stands for SQL NULL.
    is_correct_str = "\\N"
    if self.is_correct is not None:
        is_correct_str = repr(self.is_correct)
    tsv_line = "\t".join([
        "\\N", self.doc_id, str(self.sent_id),
        list2TSVarray(self.wordidxs), self.id(), self.type, self.entity,
        list2TSVarray(self.words, quote=True), is_correct_str,
        list2TSVarray(list(self.features), quote=True)])
    return tsv_line
def tsv_dump(self):
    # As above, for mentions whose table has no features column.
    is_correct_str = "\\N"
    if self.is_correct is not None:
        is_correct_str = repr(self.is_correct)
    tsv_line = "\t".join([
        "\\N", self.doc_id, str(self.sent_id),
        list2TSVarray(self.wordidxs), self.id(), self.type, self.entity,
        list2TSVarray(self.words, quote=True), is_correct_str])
    return tsv_line
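# list2TSVarray is used throughout these excerpts but not defined in them.
# A minimal sketch, assuming it renders a Python list as a PostgreSQL array
# literal ("{...}") suitable for COPY FROM; with quote=True each element is
# double-quoted, with backslashes and quotes escaped. The real helper may
# differ in its escaping details.
def list2TSVarray(a_list, quote=False):
    if quote:
        elements = ['"{}"'.format(
            str(x).replace("\\", "\\\\").replace('"', '\\"'))
            for x in a_list]
    else:
        elements = [str(x) for x in a_list]
    return "{" + ",".join(elements) + "}"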
def tsv_dump(self):
    # Dump this relation as one TSV line; "\N" stands for SQL NULL.
    is_correct_str = "\\N"
    if self.is_correct is not None:
        is_correct_str = repr(self.is_correct)
    return "\t".join([
        "\\N", self.doc_id, str(self.sent_id_1), str(self.sent_id_2),
        self.id(), self.type, self.mention_1_id, self.mention_2_id,
        list2TSVarray([x.in_sent_idx for x in self.mention_1_words]),
        list2TSVarray([x.in_sent_idx for x in self.mention_2_words]),
        list2TSVarray([x.word for x in self.mention_1_words], quote=True),
        list2TSVarray([x.word for x in self.mention_2_words], quote=True),
        is_correct_str])
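# An illustrative output row for the relation dump above (all values are
# hypothetical): the leading "\N" leaves the id column to be filled in by
# the database, and the array columns are PostgreSQL literals.
#
#   \N  doc01  3  3  <relation id>  <relation type>  <mention 1 id>
#   <mention 2 id>  {1,2}  {7}  {"BRCA1","gene"}  {"carcinoma"}  True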
import json
import os
import sys


def process_files(proc_id, input_files, input_dir, output_dir, mode):
    with open(os.path.realpath("{}/sentences-{}.{}".format(
            output_dir, proc_id, mode)), 'wt') as out_file:
        for filename in input_files:
            # Docid assumed to be the filename.
            docid = filename
            with open(os.path.realpath(input_dir + "/" + filename),
                      'rt') as curr_file:
                atEOF = False
                # Check if the file is empty (we are at End of File)
                curr_pos = curr_file.tell()
                curr_file.read(1)
                new_pos = curr_file.tell()
                if new_pos == curr_pos:
                    atEOF = True
                else:
                    curr_file.seek(curr_pos)
                # One iteration of the following loop corresponds to one
                # sentence
                while not atEOF:
                    sent_id = -1
                    wordidxs = []
                    words = []
                    poses = []
                    ners = []
                    lemmas = []
                    dep_paths = []
                    dep_parents = []
                    bounding_boxes = []
                    curr_line = curr_file.readline().strip()
                    # Sentences are separated by empty lines in the parser
                    # output file
                    while curr_line != "":
                        tokens = curr_line.split("\t")
                        if len(tokens) != 9:
                            sys.stderr.write(
                                "ERROR: malformed line (wrong number of "
                                "fields): {}\n".format(curr_line))
                            return 1
                        (word_idx, word, pos, ner, lemma, dep_path,
                         dep_parent, word_sent_id, bounding_box) = tokens
                        # Normalize sentence id
                        word_sent_id = int(word_sent_id.replace("SENT_", ""))
                        # Assign sentence id if this is the first word of
                        # the sentence
                        if sent_id == -1:
                            sent_id = word_sent_id
                        # Sanity check for word_sent_id
                        elif sent_id != word_sent_id:
                            sys.stderr.write(
                                "ERROR: found word with mismatching sent_id "
                                "w.r.t. sentence: {} != {}\n".format(
                                    word_sent_id, sent_id))
                            return 1
                        # Normalize bounding box, stripping initial '[' and
                        # final '],' and concatenating components
                        bounding_box = bounding_box[1:-2]
                        bounding_box = bounding_box.replace(", ", "-")
                        # Append contents of this line to the sentence arrays
                        wordidxs.append(int(word_idx) - 1)  # Start from 0
                        words.append(word)
                        poses.append(pos)
                        ners.append(ner)
                        lemmas.append(lemma)
                        dep_paths.append(dep_path)
                        # Now "-1" means root and the rest correspond to
                        # array indices
                        dep_parents.append(int(dep_parent) - 1)
                        bounding_boxes.append(bounding_box)
                        # Read the next line
                        curr_line = curr_file.readline().strip()
                    # Write sentence to output
                    if mode == "tsv":
                        out_file.write("{}\n".format("\t".join([
                            docid, str(sent_id), list2TSVarray(wordidxs),
                            list2TSVarray(words, quote=True),
                            list2TSVarray(poses, quote=True),
                            list2TSVarray(ners),
                            list2TSVarray(lemmas, quote=True),
                            list2TSVarray(dep_paths, quote=True),
                            list2TSVarray(dep_parents),
                            list2TSVarray(bounding_boxes)])))
                    elif mode == "json":
                        out_file.write("{}\n".format(json.dumps({
                            "doc_id": docid, "sent_id": sent_id,
                            "wordidxs": wordidxs, "words": words,
                            "poses": poses, "ners": ners, "lemmas": lemmas,
                            "dep_paths": dep_paths,
                            "dep_parents": dep_parents,
                            "bounding_boxes": bounding_boxes})))
                    # Check if we are at End of File
                    curr_pos = curr_file.tell()
                    curr_file.read(1)
                    new_pos = curr_file.tell()
                    if new_pos == curr_pos:
                        atEOF = True
                    else:
                        curr_file.seek(curr_pos)
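# A hypothetical driver for process_files(), not part of the original
# excerpt: shard the input files across worker processes so that each one
# writes its own sentences-<proc_id>.<mode> file. The function name run_all
# and the num_procs parameter are assumptions made for illustration.
from multiprocessing import Process


def run_all(input_dir, output_dir, mode, num_procs=4):
    input_files = sorted(os.listdir(input_dir))
    # Round-robin assignment of files to workers.
    shards = [input_files[i::num_procs] for i in range(num_procs)]
    workers = []
    for proc_id, shard in enumerate(shards):
        p = Process(target=process_files,
                    args=(proc_id, shard, input_dir, output_dir, mode))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()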
doc_id = line_dict["doc_id"]
sent_id = line_dict["sent_id"]
words = line_dict["words"]
wordidxs = list(range(len(words)))
poses = line_dict["poses"]
ners = line_dict["ners"]
lemmas = line_dict["lemmas"]
dep_paths_orig = line_dict["dep_paths"]
bounding_boxes = ["empty"] * len(words)
# The document id ends with the index of the gene this sentence is about.
gene_index = int(doc_id.split("-")[-1])
# Compute dependency path edge labels and node parents
dep_paths = ["_"] * len(words)
dep_parents = [0] * len(words)
for dep_path in dep_paths_orig:
    tokens = dep_path.split("(")
    dep_parent = int((tokens[1].split(", ")[0]).split("-")[-1]) - 1
    dep_child = int((tokens[1].split(", ")[-1]).split("-")[-1][:-1]) - 1
    dep_paths[dep_child] = tokens[0]
    dep_parents[dep_child] = dep_parent
print("{}".format("\t".join([
    doc_id, str(sent_id), list2TSVarray(wordidxs),
    list2TSVarray(words, quote=True), list2TSVarray(poses, quote=True),
    list2TSVarray(ners), list2TSVarray(lemmas, quote=True),
    list2TSVarray(dep_paths, quote=True), list2TSVarray(dep_parents),
    list2TSVarray(bounding_boxes), genes[gene_index]])))
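# A worked example of the dependency parsing above, using Stanford-style
# "label(governor-i, dependent-j)" strings with 1-based token offsets
# (illustrative input):
#
#   dep_paths_orig = ["nsubj(plays-2, Haag-1)", "root(ROOT-0, plays-2)"]
#
# "nsubj(plays-2, Haag-1)" splits into label "nsubj", governor offset 2 and
# dependent offset 1; after the 1-to-0-based shift, dep_paths[0] == "nsubj"
# and dep_parents[0] == 1. For the root entry the governor is ROOT-0, so
# dep_parents[1] == -1, i.e. -1 marks the root of the tree.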
contains_kw = False
for definition in acronyms[acronym]:
    # Check if the definition contains some keywords that make us suspect
    # that it is probably a gene/protein. This list is incomplete, and it
    # would be good to add to it.
    if contains_kw:
        continue
    for word in definition.split():
        if word.endswith("ase") and len(word) > 5:
            contains_kw = True
            break
    if " gene" in definition or \
            "protein" in definition or \
            "factor" in definition or \
            "ligand" in definition or \
            "enzyme" in definition or \
            "receptor" in definition or \
            "pseudogene" in definition:
        contains_kw = True
# If no significant keyword in any definition, supervise as not correct
if not contains_kw and not is_correct:
    is_correct = False
is_correct_str = "\\N"
if is_correct is not None:
    is_correct_str = repr(is_correct)
print("\t".join((
    line_dict["doc_id"], acronym,
    list2TSVarray(list(acronyms[acronym]), quote=True), is_correct_str)))
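# Illustrative behavior of the keyword filter above (hypothetical
# definitions):
#
#   "DNA polymerase"             -> "polymerase" ends in "ase" and is longer
#                                   than 5 characters: flagged
#   "tumor necrosis factor"      -> contains "factor": flagged
#   "free base"                  -> "base" is too short for the "ase" rule:
#                                   not flagged
#   "American Heart Association" -> no keyword: not flagged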