Example #1
 def tsv_dump(self):
     is_correct_str = "\\N"
     if self.is_correct is not None:
         is_correct_str = repr(self.is_correct)
     tsv_line = "\t".join(
         ["\\N", self.doc_id, str(self.sent_id),
             list2TSVarray(self.wordidxs), self.id(), self.type,
             self.entity, list2TSVarray(self.words, quote=True),
             is_correct_str, list2TSVarray(list(self.features), quote=True)])
     return tsv_line
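All of these examples call a helper named list2TSVarray that is not shown in the listings. Judging from the call sites (a list argument plus an optional quote flag) and the PostgreSQL-style \N null marker used alongside it, it plausibly serializes a Python list as a PostgreSQL array literal. A minimal sketch under those assumptions; the real implementation's separator and escaping rules may differ:

def list2TSVarray(a_list, quote=False):
    # Hypothetical reconstruction: serialize a list as a
    # PostgreSQL-style array literal, e.g. {1,2,3} or {"a","b"}
    if quote:
        # Escape backslashes and double quotes, then quote each element
        elems = ['"{}"'.format(str(x).replace('\\', '\\\\').replace('"', '\\"'))
                 for x in a_list]
    else:
        elems = [str(x) for x in a_list]
    return "{" + ",".join(elems) + "}"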
Example #2
 def tsv_dump(self):
     is_correct_str = "\\N"
     if self.is_correct is not None:
         is_correct_str = repr(self.is_correct)
     tsv_line = "\t".join([
         "\\N", self.doc_id,
         str(self.sent_id),
         list2TSVarray(self.wordidxs),
         self.id(), self.type, self.entity,
         list2TSVarray(self.words, quote=True), is_correct_str
     ])
     return tsv_line
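The "\N" fallback and the repr() call make more sense together: \N is the NULL marker in PostgreSQL's COPY text format (an assumption based on the pipeline these examples appear to feed), and repr turns the booleans True and False into those literal strings. A quick check:

# How is_correct serializes: \N for unlabeled, literal True/False otherwise
for value in (None, True, False):
    print("\\N" if value is None else repr(value))
# Prints \N, then True, then False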
Example #3
 def tsv_dump(self):
     is_correct_str = "\\N"
     if self.is_correct is not None:
         is_correct_str = repr(self.is_correct)
     return "\t".join(
         ["\\N", self.doc_id, str(self.sent_id_1), str(self.sent_id_2),
             self.id(), self.type, self.mention_1_id, self.mention_2_id,
             list2TSVarray([x.in_sent_idx for x in self.mention_1_words]),
             list2TSVarray([x.in_sent_idx for x in self.mention_2_words]),
             list2TSVarray([x.word for x in self.mention_1_words], quote=True),
             list2TSVarray([x.word for x in self.mention_2_words], quote=True),
             is_correct_str])
Example #4
 def tsv_dump(self):
     is_correct_str = "\\N"
     if self.is_correct is not None:
         is_correct_str = repr(self.is_correct)
     return "\t".join([
         "\\N", self.doc_id,
         str(self.sent_id_1),
         str(self.sent_id_2),
         self.id(), self.type, self.mention_1_id, self.mention_2_id,
         list2TSVarray([x.in_sent_idx for x in self.mention_1_words]),
         list2TSVarray([x.in_sent_idx for x in self.mention_2_words]),
         list2TSVarray([x.word for x in self.mention_1_words], quote=True),
         list2TSVarray([x.word for x in self.mention_2_words], quote=True),
         is_correct_str
     ])
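Dumping a batch of these objects is then just a matter of writing one line per call. A usage sketch; the relations iterable and the output filename are hypothetical:

# `relations` is assumed to be an iterable of objects exposing tsv_dump()
with open("relations.tsv", "wt") as out:
    for rel in relations:
        out.write(rel.tsv_dump() + "\n")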
Example #5
import json
import os
import sys


def process_files(proc_id, input_files, input_dir, output_dir, mode):
    with open(
            os.path.realpath("{}/sentences-{}.{}".format(
                output_dir, proc_id, mode)), 'wt') as out_file:
        for filename in input_files:
            # Docid assumed to be the filename.
            docid = filename
            with open(os.path.realpath(os.path.join(input_dir, filename)),
                      'rt') as curr_file:
                atEOF = False
                # Check if the file is empty (we are at End of File)
                curr_pos = curr_file.tell()
                curr_file.read(1)
                new_pos = curr_file.tell()
                if new_pos == curr_pos:
                    atEOF = True
                else:
                    curr_file.seek(curr_pos)
                # One iteration of the following loop corresponds to one sentence
                while not atEOF:
                    sent_id = -1
                    wordidxs = []
                    words = []
                    poses = []
                    ners = []
                    lemmas = []
                    dep_paths = []
                    dep_parents = []
                    bounding_boxes = []
                    curr_line = curr_file.readline().strip()
                    # Sentences are separated by empty lines in the parser output file
                    while curr_line != "":
                        tokens = curr_line.split("\t")
                        if len(tokens) != 9:
                            sys.stderr.write(
                                "ERROR: malformed line (wrong number of fields): {}\n"
                                .format(curr_line))
                            return 1
                        word_idx, word, pos, ner, lemma, dep_path, dep_parent, word_sent_id, bounding_box = tokens
                        # Normalize sentence id
                        word_sent_id = int(word_sent_id.replace("SENT_", ""))
                        # assign sentence id if this is the first word of the sentence
                        if sent_id == -1:
                            sent_id = word_sent_id
                        # sanity check for word_sent_id
                        elif sent_id != word_sent_id:
                            sys.stderr.write(
                                "ERROR: found word with mismatching sent_id w.r.t. sentence: {} != {}\n"
                                .format(word_sent_id, sent_id))
                            return 1
                        # Normalize bounding box, stripping initial '[' and
                        # final '],' and concatenating components
                        bounding_box = bounding_box[1:-2]
                        bounding_box = bounding_box.replace(", ", "-")
                        # Append contents of this line to the sentence arrays
                        wordidxs.append(int(word_idx) - 1)  # Start from 0
                        words.append(word)
                        poses.append(pos)
                        ners.append(ner)
                        lemmas.append(lemma)
                        dep_paths.append(dep_path)
                        # Now "-1" means root and the rest correspond to array indices
                        dep_parents.append(int(dep_parent) - 1)
                        bounding_boxes.append(bounding_box)
                        # Read the next line
                        curr_line = curr_file.readline().strip()
                    # Write sentence to output
                    if mode == "tsv":
                        out_file.write("{}\n".format("\t".join([
                            docid,
                            str(sent_id),
                            list2TSVarray(wordidxs),
                            list2TSVarray(words, quote=True),
                            list2TSVarray(poses, quote=True),
                            list2TSVarray(ners),
                            list2TSVarray(lemmas, quote=True),
                            list2TSVarray(dep_paths, quote=True),
                            list2TSVarray(dep_parents),
                            list2TSVarray(bounding_boxes)
                        ])))
                    elif mode == "json":
                        out_file.write("{}\n".format(
                            json.dumps({
                                "doc_id": docid,
                                "sent_id": sent_id,
                                "wordidxs": wordidxs,
                                "words": words,
                                "poses": poses,
                                "ners": ners,
                                "lemmas": lemmas,
                                "dep_paths": dep_paths,
                                "dep_parents": dep_parents,
                                "bounding_boxes": bounding_boxes
                            })))
                    # Check if we are at End of File
                    curr_pos = curr_file.tell()
                    curr_file.read(1)
                    new_pos = curr_file.tell()
                    if new_pos == curr_pos:
                        atEOF = True
                    else:
                        curr_file.seek(curr_pos)
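The proc_id parameter and the per-process output file name suggest process_files is meant to be run by several workers in parallel. A driver sketch under that assumption; the round-robin split and the process count are illustrative, not from the source:

import multiprocessing
import os

def run_parallel(input_dir, output_dir, mode="tsv", num_procs=4):
    # Round-robin the input files across workers; each worker writes
    # its own sentences-<proc_id>.<mode> file via process_files above
    files = sorted(os.listdir(input_dir))
    workers = []
    for proc_id in range(num_procs):
        p = multiprocessing.Process(
            target=process_files,
            args=(proc_id, files[proc_id::num_procs],
                  input_dir, output_dir, mode))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()

On platforms that spawn rather than fork (Windows, recent macOS), run_parallel should be called under an if __name__ == "__main__" guard.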
Example #6
        doc_id = line_dict["doc_id"]
        sent_id = line_dict["sent_id"]
        words = line_dict["words"]
        wordidxs = list(range(len(words)))
        poses = line_dict["poses"]
        ners = line_dict["ners"]
        lemmas = line_dict["lemmas"]
        dep_paths_orig = line_dict["dep_paths"]
        bounding_boxes = ["empty"] * len(words)

        gene_index = int(doc_id.split("-")[-1])

        # Compute dependency path edge labels and node parents
        dep_paths = ["_"] * len(words)
        dep_parents = [0] * len(words)
        for dep_path in dep_paths_orig:
            tokens = dep_path.split("(")
            dep_parent = int((tokens[1].split(", ")[0]).split("-")[-1]) - 1
            dep_child = int((tokens[1].split(", ")[-1]).split("-")[-1][:-1]) - 1
            dep_paths[dep_child] = tokens[0]
            dep_parents[dep_child] = dep_parent

        print("{}".format("\t".join([doc_id, str(sent_id),
            list2TSVarray(wordidxs), list2TSVarray(words,
                quote=True), list2TSVarray(poses, quote=True),
            list2TSVarray(ners), list2TSVarray(lemmas, quote=True),
            list2TSVarray(dep_paths, quote=True),
            list2TSVarray(dep_parents),
            list2TSVarray(bounding_boxes), genes[gene_index]])))
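The dep_paths_orig parsing above assumes Stanford-parser-style dependency strings of the form label(governor-index, dependent-index) with 1-based token indices. A standalone trace of that logic on an illustrative string:

dep_path = "nsubj(loves-2, John-1)"   # illustrative, not from the source
tokens = dep_path.split("(")          # ["nsubj", "loves-2, John-1)"]
dep_parent = int((tokens[1].split(", ")[0]).split("-")[-1]) - 1       # 1
dep_child = int((tokens[1].split(", ")[-1]).split("-")[-1][:-1]) - 1  # 0
print(tokens[0], dep_parent, dep_child)  # nsubj 1 0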

Example #7
                        # Check if the definition contains some keywords that
                        # make us suspect that it is probably a gene/protein.
                        # This list is incomplete, and it would be good to add
                        # to it.
                        if contains_kw:
                            continue
                        for word in definition.split():
                            if word.endswith("ase") and len(word) > 5:
                                contains_kw = True
                                break
                        if " gene" in definition or \
                                "protein" in definition or \
                                "factor" in definition or \
                                "ligand" in definition or \
                                "enzyme" in definition or \
                                "receptor" in definition or \
                                "pseudogene" in definition:
                            contains_kw = True
                # If no significant keyword in any definition, supervise as not
                # correct
                if not contains_kw and not is_correct:
                    is_correct = False
                is_correct_str = "\\N"
                if is_correct is not None:
                    is_correct_str = repr(is_correct)
                print("\t".join(
                    (line_dict["doc_id"], acronym,
                    list2TSVarray(list(acronyms[acronym]), quote=True),
                    is_correct_str)))
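The keyword screen above is easy to factor out and test in isolation. A sketch with the keyword list copied from the code; the helper name is hypothetical:

GENE_KEYWORDS = (" gene", "protein", "factor", "ligand",
                 "enzyme", "receptor", "pseudogene")

def definition_suggests_gene(definition):
    # Long words ending in "-ase" are typical enzyme names
    for word in definition.split():
        if word.endswith("ase") and len(word) > 5:
            return True
    return any(kw in definition for kw in GENE_KEYWORDS)

print(definition_suggests_gene("epidermal growth factor receptor"))  # True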

Example #8
                        break
                    else:
                        # Check if the definition contains some keywords that
                        # make us suspect that it is probably a gene/protein.
                        # This list is incomplete, and it would be good to add
                        # to it.
                        if contains_kw:
                            continue
                        for word in definition.split():
                            if word.endswith("ase") and len(word) > 5:
                                contains_kw = True
                                break
                        if " gene" in definition or \
                                "protein" in definition or \
                                "factor" in definition or \
                                "ligand" in definition or \
                                "enzyme" in definition or \
                                "receptor" in definition or \
                                "pseudogene" in definition:
                            contains_kw = True
                # If no significant keyword in any definition, supervise as not
                # correct
                if not contains_kw and not is_correct:
                    is_correct = False
                is_correct_str = "\\N"
                if is_correct is not None:
                    is_correct_str = repr(is_correct)
                print("\t".join((line_dict["doc_id"], acronym,
                                 list2TSVarray(list(acronyms[acronym]),
                                               quote=True), is_correct_str)))