    def match(self, tree):
        try:
            if tree.label() != 'ROOT':
                raise IndexError
            if tree[0].label() != 'SBARQ':
                raise IndexError
            if tree[0][0][0].label() != 'WRB':
                raise IndexError
            if tree[0][0][0][0].lower() != 'when':
                raise IndexError
            if tree[0][1].label() != 'SQ':
                raise IndexError
            if tree[0][1][0].label() != 'VBD':
                raise IndexError
            if tree[0][1][1].label() != 'NP':
                raise IndexError
            if tree[0][1][2].label() != 'VP':
                raise IndexError

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(str(tree[0][1][1]))
            part.property = ParentedTree.fromstring(str(Tree('VP', [
                Tree.fromstring(str(tree[0][0][0])),
                Tree.fromstring(str(tree[0][1][0])),
                Tree.fromstring(str(tree[0][1][2]))
            ])))

            return [part]
        except IndexError:
            return []
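A minimal usage sketch for the matcher above. WhenPattern is a hypothetical Pattern subclass that exposes this match() method, and the bracketed question is an assumed parser output:

from nltk import ParentedTree

tree = ParentedTree.fromstring(
    "(ROOT (SBARQ (WHADVP (WRB When)) (SQ (VBD did) (NP (NNP Apollo) (CD 11)) (VP (VB land)))))")
parts = WhenPattern().match(tree)  # hypothetical subclass; returns [part] on a "when did X ..." match, else []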
Example #2
    def walker(self, parent):
        if parent.label() == 'IN' and parent.leaves() == ["of"]:
            pos = parent.parent().treeposition()

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(str(parent.right_sibling()))
            part.property = ParentedTree.fromstring(str(self.get_query_tree()))
            part.property[pos[:-1]].remove(part.property[pos])
            self._parts.append(part)

        for child in parent:
            if isinstance(child, ParentedTree):
                self.walker(child)
Example #4
    def run(self, args):
        input_text = args["input"]["text"][0]
        input_parse = args["input"]["parse"][0]
        output_parse = args["output"]["parse"][0]
        doc_list = args["doc_list"]
        tuples = self.get_io_files([input_text, input_parse, output_parse], doc_list)

        for files in tuples:
            indexed_parses = []
            in_text_file, in_parse_file, out_parse_file = files

            text = self.read_file(in_text_file)

            parses = self.read_file(in_parse_file)
            parses = json.loads(parses)

            leafreader = LeafReader(text)

            for parse in parses:
                tree = ParentedTree.fromstring(parse, read_leaf=leafreader.read_leaf)
                indexed_parses.append(tree.pprint(margin=float("inf")))

            # serialize the indexed parses back to JSON
            output = json.dumps(indexed_parses)

            self.write_file(output, out_parse_file)
Example #5
    def match(self, *args, **kwargs):
        Pattern.match(self, *args, **kwargs)
        try:
            if self.get_query_tree().label() != "ROOT":
                raise IndexError

            if self.get_query_tree()[0].label() != "SBARQ":
                raise IndexError

            if self.get_query_tree()[0][0].label() != "WHNP":
                raise IndexError

            if self.get_query_tree()[0][0][0].label() != "WP":
                raise IndexError

            if self.get_query_tree()[0][0][0][0].lower() != self._keyword:
                raise IndexError

            if self.get_query_tree()[0][1].label() != "SQ":
                raise IndexError

            if len(self.get_query_tree()[0][1]) < 2:
                raise IndexError

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(
                str(self.get_query_tree()[0][1][1]))
            self._parts.append(part)

            return self._parts

        except IndexError:
            return []
Example #6
def extract_independent_clauses(input_sent, predictor):
    output = predictor.predict(sentence=input_sent)
    tree_str = output["trees"]
    t = ParentedTree.fromstring(tree_str)
    candidate_nodes = list(t.subtrees(filter=lambda x: filt_r(x) or filt_l(x)))
    for node in list(candidate_nodes):  # iterate over a copy; the original list is mutated below
        if node.parent() in candidate_nodes:
            candidate_nodes.remove(node.parent())
    sub_sentences = []
    for candidate in candidate_nodes:
        temp = []
        for subtree in candidate:
            temp += subtree.leaves()
        sub_sentences.append(temp)
    sub_sentences = sub_sentences if sub_sentences else [t.leaves()]
    sentences = []
    for sentence in sub_sentences:
        temp = ""
        for i, word in enumerate(sentence):
            if i == 0:
                temp += word[0].title() + word[1:]
            elif word in [".", "!", "?", ",", ";"]:
                temp += word
            else:
                temp += " " + word
        temp = temp.replace(" ’", "’")
        temp = temp.replace(" n’", "n’")
        sentences.append(temp)
    return sentences
Example #7
    def match(self, *args, **kwargs):
        Pattern.match(self, *args, **kwargs)
        try:
            if self.get_query_tree().label() != "ROOT":
                raise IndexError

            if self.get_query_tree()[0].label() != "SBARQ":
                raise IndexError

            if self.get_query_tree()[0][0].label() != "WHNP":
                raise IndexError

            if self.get_query_tree()[0][0][0].label() != "WP":
                raise IndexError

            if self.get_query_tree()[0][0][0][0].lower() != self._keyword:
                raise IndexError

            if self.get_query_tree()[0][1].label() != "SQ":
                raise IndexError

            if len(self.get_query_tree()[0][1]) < 2:
                raise IndexError

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(str(self.get_query_tree()[0][1][1]))
            self._parts.append(part)

            return self._parts

        except IndexError:
            return []
def replace_terminals_with_indices(treestring):
    ''' Replaces each terminal in the tree read from a string with an index in the sentence '''
    tree = ParentedTree.fromstring(treestring)
    for idx, _ in enumerate(tree.leaves()):
        tree_location = tree.leaf_treeposition(idx)
        non_terminal = tree[tree_location[:-1]]
        non_terminal[0] = str(idx)
    return tree
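A quick sanity check of the helper above (the tree string is an assumed example):

t = replace_terminals_with_indices("(S (NP (DT The) (NN dog)) (VP (VBZ barks)))")
print(t.leaves())  # ['0', '1', '2'] -- each terminal replaced by its position in the sentence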
Example #9
def generate(sent, synt, tmpls, synpg_model, pg_model, args):
    with torch.no_grad():

        # convert syntax to tag sequence
        tagss = np.zeros((len(tmpls), args.max_sent_len), dtype=np.long)
        tags_ = ParentedTree.fromstring(synt)
        tags_ = getleaf(tags_)
        tags_ = [
            dictionary.word2idx[f"<{w}>"] for w in tags_
            if f"<{w}>" in dictionary.word2idx
        ]
        tags_ = tags_[:args.max_sent_len]
        tagss[:, :len(tags_)] = tags_

        tagss = torch.from_numpy(tagss).cuda()

        # generate parses from tag sequence and templates
        parse_idxs = pg_model.generate(tagss,
                                       tmpls,
                                       args.max_synt_len,
                                       temp=args.temp)

        # add <sos> and remove tokens after <eos>
        synts = np.zeros((len(tmpls), args.max_synt_len + 2), dtype=np.long)
        synts[:, 0] = 1

        for i in range(len(tmpls)):
            parse_idx = parse_idxs[i].cpu().numpy()
            eos_pos = np.where(parse_idx == dictionary.word2idx["<eos>"])[0]
            eos_pos = eos_pos[0] + 1 if len(eos_pos) > 0 else len(parse_idx)
            synts[i, 1:eos_pos + 1] = parse_idx[:eos_pos]

        synts = torch.from_numpy(synts).cuda()

        # bpe segment and convert sentence to tensor
        sents = np.zeros((len(tmpls), args.max_sent_len), dtype=np.long)
        sent_ = bpe.segment(sent).split()
        sent_ = [
            dictionary.word2idx[w]
            if w in dictionary.word2idx else dictionary.word2idx["<unk>"]
            for w in sent_
        ]
        sent_ = sent_[:args.max_sent_len]
        sents[:, :len(sent_)] = sent_
        sents = torch.from_numpy(sents).cuda()

        # generate paraphrases from sentence and generated parses
        output_idxs = synpg_model.generate(sents,
                                           synts,
                                           args.max_sent_len,
                                           temp=args.temp)
        output_idxs = output_idxs.cpu().numpy()

        paraphrases = [
            reverse_bpe(synt2str(output_idxs[i], dictionary).split())
            for i in range(len(tmpls))
        ]

        return paraphrases
Example #10
def generate(model, data, loader, dictionary, bpe, args):
    model.eval()
    with open(os.path.join(args.output_dir, f"target_sents.txt"), "w") as fp1, \
         open(os.path.join(args.output_dir, f"target_synts.txt"), "w") as fp2, \
         open(os.path.join(args.output_dir, f"outputs.txt"), "w") as fp3:
        with torch.no_grad():
            iterator = tqdm(loader, total=len(loader))
            for it, data_idxs in enumerate(iterator):
                data_idxs = data_idxs.numpy()

                sents_ = data[0][data_idxs]  # sents1
                targs_ = data[1][data_idxs]  # sents2
                synts_ = data[3][data_idxs]  # synts2

                batch_size = len(sents_)
                sents = np.zeros((batch_size, args.max_sent_len),
                                 dtype=np.long)
                synts = np.zeros((batch_size, args.max_synt_len + 2),
                                 dtype=np.long)

                for i in range(batch_size):
                    sent_ = sents_[i]
                    sent_ = bpe.segment(sent_).split()
                    sent_ = [
                        dictionary.word2idx[w] if w in dictionary.word2idx else
                        dictionary.word2idx["<unk>"] for w in sent_
                    ]
                    sents[i, :len(sent_)] = sent_

                    synt_ = synts_[i]
                    synt_ = ParentedTree.fromstring(synt_)
                    synt_ = deleaf(synt_)
                    synt_ = [
                        dictionary.word2idx[f"<{w}>"] for w in synt_
                        if f"<{w}>" in dictionary.word2idx
                    ]
                    synt_ = [dictionary.word2idx["<sos>"]
                             ] + synt_ + [dictionary.word2idx["<eos>"]]
                    synts[i, :len(synt_)] = synt_

                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()

                idxs = model.generate(sents,
                                      synts,
                                      sents.size(1),
                                      sample=args.sample,
                                      temp=args.temp)

                for sent, idx, targ, synt_ in zip(sents_,
                                                  idxs.cpu().numpy(), targs_,
                                                  synts_):
                    fp1.write(targ + '\n')
                    fp2.write(synt_ + '\n')
                    fp3.write(
                        reverse_bpe(synt2str(idx, dictionary).split()) + '\n')
Example #11
def evaluate(model, data, loader, criterion, dictionary, bpe, args):
    model.eval()
    total_loss = 0.0
    max_it = len(loader)
    with torch.no_grad():
        for it, data_idxs in enumerate(loader):
            data_idxs = np.sort(data_idxs.numpy())
            
            # get batch of raw sentences and raw syntax
            sents_ = data[0][data_idxs]
            synts_ = data[1][data_idxs]

            batch_size = len(sents_)
            
            # initialize tensors
            sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long)    # words without position
            synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long)  # syntax
            targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long)  # target output

            for i in range(batch_size):
                
                # bpe segment and convert to tensor
                sent_ = sents_[i]
                sent_ = bpe.segment(sent_).split()
                sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_]
                sents[i, :len(sent_)] = sent_
                
                # add <sos> and <eos> for target output
                targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
                targs[i, :len(targ_)] = targ_
                
                # parse syntax and convert to tensor
                synt_ = synts_[i]
                synt_ = ParentedTree.fromstring(synt_)
                synt_ = deleaf(synt_)
                synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx]
                synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
                synts[i, :len(synt_)] = synt_

            sents = torch.from_numpy(sents).cuda()
            synts = torch.from_numpy(synts).cuda()
            targs = torch.from_numpy(targs).cuda()
            
            # forward
            outputs = model(sents, synts, targs)
            
            # calculate loss
            targs_ = targs[:, 1:].contiguous().view(-1)
            outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
            loss = criterion(outputs_, targs_)
        
            total_loss += loss.item()
    
    return total_loss / max_it
Example #12
def generate(epoch, eit, model, data, loader, dictionary, bpe, args, max_it=10):
    model.eval()
    with open(os.path.join(args.output_dir, "sents_valid_epoch{:02d}_it{:06d}.txt".format(epoch, eit)), "w") as fp:
        with torch.no_grad():
            for it, data_idxs in enumerate(loader):
                if it >= max_it:
                    break
                
                data_idxs = np.sort(data_idxs.numpy())
                
                # get batch of raw sentences and raw syntax
                sents_ = data[0][data_idxs]
                synts_ = data[1][data_idxs]

                batch_size = len(sents_)
                
                # initialize tensors
                sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long)    # words without position
                synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long)  # syntax
                targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long)  # target output

                for i in range(batch_size):
                    
                    # bpe segment and convert to tensor
                    sent_ = sents_[i]
                    sent_ = bpe.segment(sent_).split()
                    sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_]
                    sents[i, :len(sent_)] = sent_
                    
                    # add <sos> and <eos> for target output
                    targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
                    targs[i, :len(targ_)] = targ_
                    
                    # parse syntax and convert to tensor
                    synt_ = synts_[i]
                    synt_ = ParentedTree.fromstring(synt_)
                    synt_ = deleaf(synt_)
                    synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx]
                    synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
                    synts[i, :len(synt_)] = synt_
            
                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()
                targs = torch.from_numpy(targs).cuda()
                
                # generate
                idxs = model.generate(sents, synts, sents.size(1), temp=args.temp)
                
                # write output
                for sent, idx, synt in zip(sents.cpu().numpy(), idxs.cpu().numpy(), synts.cpu().numpy()):
                    fp.write(synt2str(synt[1:], dictionary)+'\n')
                    fp.write(sent2str(sent, dictionary)+'\n')
                    fp.write(synt2str(idx, dictionary)+'\n')
                    fp.write("--\n")
Example #13
def process_tree(tree_str, label):
    example = None
    try:
        #print("getting tree")
        tree = ParentedTree.fromstring(tree_str.__str__())
        #print("before get_relation")
        example = get_relation(tree, label)
        #print(example.e1)
    except ValueError as err:
        #print(err)
        pass
    return example
Example #14
def parse_parented_tree(tree_string):
    """ Construct a tree from a constituent parse tree string.

    Args:
        tree_string (str): A constituent parse tree in bracket notation

    Returns:
        nltk.ParentedTree: A parse tree corresponding to the parse tree string.
    """
    try:
        return ParentedTree(tree_string)
    except TypeError:
        return ParentedTree.fromstring(tree_string)
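Example call (assumed input). On current NLTK versions the single-argument ParentedTree constructor raises TypeError, so the fromstring fallback is taken:

t = parse_parented_tree("(S (NP (PRP I)) (VP (VBP like) (NP (NN tea))))")
print(t.label(), t.leaves())  # S ['I', 'like', 'tea']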
Example #15
 def __init__(self, filename, postagged='./data/postagged-files',
              parsed='./data/parsed-files',
              dependency='./data/dep-files'):
     self.filename = filename
     postagged_file = os.path.join(postagged, filename+'.tag')
     parsed_file = os.path.join(parsed, filename+'.parse')
     dep_file = os.path.join(dependency, filename+'.parse.dep')
     self.tagged_sents = [x.strip() for x in open(postagged_file) if x.strip()]
     self.parsed_sents = [ParentedTree.fromstring(x) for x in open(parsed_file) if x.strip()]
     self.dep_sents = [DepTree.fromstring(x)
                       for x in open(dep_file).read().strip().split('\n\n')
                       if x.strip()]
     assert len(self.tagged_sents) == len(self.parsed_sents)
Example #16
def parse2edus(parse):
    """
    将成分句法树切割为 EDU
    :param parse: 成分句法树 Bracket 格式文本, e.g. '( (IP (NP (PN 我)) (VP (VV 爱) (NP (NR 北京) (NR 天安门)))))'
    :return: structure.tree.EDU 生成器
    """
    pipeline = get_pipeline()
    segmenter = pipeline.segmenter
    parse = ParentedTree.fromstring(parse)
    childs = list(parse.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))
    text = ''.join([child[0] for child in childs])
    sentence = Sentence((0, len(text)), text, parse=parse)
    return segmenter.cut_edu(sentence)
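A hedged usage sketch, assuming the project's get_pipeline() segmenter is configured; the parse string is the one from the docstring:

for edu in parse2edus('( (IP (NP (PN 我)) (VP (VV 爱) (NP (NR 北京) (NR 天安门)))))'):
    print(edu)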
Example #17
def read_file(file_):
    """
    """

    trees = []

    with open(file_, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in lines:
        tree = ParentedTree.fromstring(line)
        trees.append(tree)

    return trees
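Assumed usage; "parses.txt" is a hypothetical file holding one bracketed parse per line:

trees = read_file("parses.txt")  # hypothetical path
if trees:
    print(len(trees), trees[0].label())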
Example #18
def template2tensor(templates, max_tmpl_len, dictionary):
    tmpls = np.zeros((len(templates), max_tmpl_len + 2), dtype=np.long)
    for i, tp in enumerate(templates):
        tmpl_ = ParentedTree.fromstring(tp)
        tree2tmpl(tmpl_, 1, 2)
        tmpl_ = str(tmpl_).replace(")", " )").replace("(", "( ").split(" ")
        tmpl_ = [
            dictionary.word2idx[f"<{w}>"] for w in tmpl_
            if f"<{w}>" in dictionary.word2idx
        ]
        tmpl_ = [dictionary.word2idx["<sos>"]
                 ] + tmpl_ + [dictionary.word2idx["<eos>"]]
        tmpls[i, :len(tmpl_)] = tmpl_

    tmpls = torch.from_numpy(tmpls).cuda()

    return tmpls
Example #19
def file_to_trees(filename):
    """Reads the parse trees in the given file and returns them as a list of
    ParentedTree objects.

    A depth attribute is added for each node in the full tree.

    Args:
        filename: The base name of the input document.
    Returns:
        A list of ParentedTree objects.
    """
    tree_filepath = 'data/parsed-files/' + filename + '.head.rel.tokenized.raw.parse'
    sent_trees = []
    with open(tree_filepath) as tree_file:
        for line in tree_file:
            if not line.startswith('\n'):
                tree = ParentedTree.fromstring(line)
                add_depth(tree)
                sent_trees.append(tree)
    return sent_trees
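A hedged usage sketch; the document base name is hypothetical, and add_depth plus the data/parsed-files path come from the surrounding project:

sent_trees = file_to_trees("some_document")  # hypothetical base name
for t in sent_trees[:1]:
    t.pretty_print()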
 def cut_edu(self, sent: Sentence) -> List[EDU]:
     if not hasattr(sent, "parse"):
         print(sent.text)
         parse = self.parser.parse(sent.text)
     else:
         parse = getattr(sent, "parse")
     parse = ParentedTree.fromstring(parse.pformat())
     children = list(
         parse.subtrees(
             lambda t: t.height() == 2 and t.label() != '-NONE-'))
     edus = []
     last_edu_words = []
     last_edu_tags = []
     offset = 0
     for child in children:
         if child[0] == '-LRB-':
             child[0] = '('
         if child[0] == '-RRB-':
             child[0] = ')'
         last_edu_words.append(child[0])
         last_edu_tags.append(child.label())
         if child[0] in self._eos or (child[0] in self.candidate and
                                      self.model.predict(offset, parse)):
             text = "".join(last_edu_words)
             edu = EDU([TEXT(text)])
             setattr(edu, "words", last_edu_words)
             setattr(edu, "tags", last_edu_tags)
             edus.append(edu)
             last_edu_words = []
             last_edu_tags = []
         offset += len(child[0])
     if last_edu_words:
         text = "".join(last_edu_words)
         edu = EDU([TEXT(text)])
         setattr(edu, "words", last_edu_words)
         setattr(edu, "tags", last_edu_tags)
         edus.append(edu)
     return edus
Example #21
def get_phrases(sentence):
    chunking_temp = []
    chunking = []
    sent_tagged = pos_tag(sentence)

    res = get_tregex(sentence, 'ROOT')
    if (res):
        tree = ParentedTree.fromstring(res['0']['match'])

        tree.pretty_print()  # pretty_print() prints directly and returns None
        list_removed = []

        traverse_tree(tree, chunking_temp)
        print(chunking_temp)
        #fix overlap string
        for x in range(0, len(chunking_temp)):
            p_x, tagged_x = chunking_temp[x]
            for y in range(x + 1, len(chunking_temp)):
                p_y = chunking_temp[y][0]

                if (tagged_x == 'VP' or tagged_x == 'S' or tagged_x == 'SBAR'):
                    #do intersection
                    p_x = p_x.replace(p_y, '').strip()
                else:
                    if p_y in p_x and p_y not in list_removed:
                        list_removed.append(p_y)

            #chunking_temp[x][0] = re.sub(r"  ", " ", chunking_temp[x][0])
            #if(chunking_temp[x][0] != ''):
            if (p_x not in list_removed and p_x != ''):
                tags = []
                for s in sent_tagged:
                    if s[0] in p_x:
                        tags.append(s[1])
                chunking.append((p_x, tagged_x, ' '.join(tags)))
    return chunking
def batch_extract(file):
    '''
    Extract the isolated events from the sentences in a txt file.
    file: txt with one (ROOT (S ...) ) sentence per line
    '''
    tree = ""
    events = []
    # open the input file with the sentences
    with open(file, "r") as file:
        # read a sentence
        sentence = file.readline()
        while sentence:
            sentence = sentence.strip("\n")
            try:
                # make a tree from the sentence
                tree = ParentedTree.fromstring(sentence)
                event = extract_isolate(tree)
                events.append(event)

            except Exception as e:
                pass

            sentence = file.readline()
    return events
Example #23
def toNLTKtree(str):
    newTree = ParentedTree.fromstring(str)
    return newTree
Example #24
def parse_indexify_transformations(in_p, out_p, label_voc, args):

    in_trimmed_seqs = []
    in_seqs = []
    out_trimmed_seqs = []
    out_seqs = []

    max_trans_size = 0
    for idx in range(len(in_p)):

        # very rarely, a tree is invalid
        try:
            in_trimmed = ParentedTree.fromstring(in_p[idx])
            in_orig = ParentedTree.fromstring(in_p[idx])
            out_trimmed = ParentedTree.fromstring(out_p[idx])
            out_orig = ParentedTree.fromstring(out_p[idx])
        except:
            continue

        out_dh = parse_tree_level_dropout(out_trimmed, args.tree_level_dropout)
        parse_tree_level_dropout(in_trimmed, args.tree_level_dropout, level=out_dh)

        in_orig = deleaf(in_orig)
        in_trimmed = deleaf(in_trimmed)
        out_orig = deleaf(out_orig)
        out_trimmed = deleaf(out_trimmed)

        if max_trans_size < len(in_orig):
            max_trans_size = len(in_orig)
        if max_trans_size < len(out_orig):
            max_trans_size = len(out_orig)

        # only consider instances where top-level of input parse != top-level output
        if in_trimmed != out_trimmed:
            # make sure everything is invocab
            try:             
                x = [label_voc[z] for z in in_orig]
                x = [label_voc[z] for z in out_orig]
                in_seqs.append(in_orig)
                out_seqs.append(out_orig)
                out_trimmed_seqs.append(out_trimmed)
                in_trimmed_seqs.append(in_trimmed)
            except:
                pass

    # no syntactic transformations in the batch!
    if len(in_seqs) == 0:
        return None

    # otherwise, indexify and return
    else:
        in_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')
        out_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')
        in_trimmed_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')
        out_trimmed_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')

        in_lengths = []
        out_lengths = []
        out_trimmed_lengths = []
        in_trimmed_lengths = []
        for idx in range(len(in_seqs)):
            curr_in = in_seqs[idx]
            in_trans_np[idx, :len(curr_in)] = [label_voc[z] for z in curr_in]
            in_lengths.append(len(curr_in))

            curr_out = out_seqs[idx]
            out_trans_np[idx, :len(curr_out)] = [label_voc[z] for z in curr_out]
            out_lengths.append(len(curr_out))

            curr_trimmed_in = in_trimmed_seqs[idx]
            in_trimmed_np[idx, :len(curr_trimmed_in)] = [label_voc[z] for z in curr_trimmed_in]
            in_trimmed_lengths.append(len(curr_trimmed_in))

            curr_trimmed_out = out_trimmed_seqs[idx]
            out_trimmed_np[idx, :len(curr_trimmed_out)] = [label_voc[z] for z in curr_trimmed_out]
            out_trimmed_lengths.append(len(curr_trimmed_out))

        # cut off extra padding
        in_trans_np = in_trans_np[:, :np.max(in_lengths)]
        out_trans_np = out_trans_np[:, :np.max(out_lengths)]
        in_trimmed_np = in_trimmed_np[:, :np.max(in_trimmed_lengths)]
        out_trimmed_np = out_trimmed_np[:, :np.max(out_trimmed_lengths)]

        return in_trans_np, out_trans_np, in_trimmed_np, out_trimmed_np,\
            np.array(in_lengths, dtype='int32'), np.array(out_lengths, dtype='int32'),\
            np.array(in_trimmed_lengths, dtype='int32'), np.array(out_trimmed_lengths, dtype='int32')
Example #25
def indexify_transformations(in_p, out_p, label_voc, args):

    in_seqs = []
    out_seqs = []
    mismatch_inds = []

    max_trans_size = 0
    for idx in range(len(in_p)):

        # very rarely, a tree is invalid
        try:
            in_tree = ParentedTree.fromstring(in_p[idx])
            out_tree = ParentedTree.fromstring(out_p[idx])
        except:
            continue

        if args.tree_dropout > 0:
            tree_dropout(in_tree, args.tree_dropout, 0)
            tree_dropout(out_tree, args.tree_dropout, 0)
        elif args.tree_level_dropout > 0:
            parse_tree_level_dropout(in_tree, args.tree_level_dropout)
            parse_tree_level_dropout(out_tree, args.tree_level_dropout)

        in_full_trans = deleaf(in_tree)
        out_full_trans = deleaf(out_tree)

        if max_trans_size < len(in_full_trans):
            max_trans_size = len(in_full_trans)
        if max_trans_size < len(out_full_trans):
            max_trans_size = len(out_full_trans)

        # only consider instances where input syntax differs from output syntax
        if in_full_trans != out_full_trans:
            # make sure everything is invocab
            try:
                x = [label_voc[z] for z in in_full_trans]
                x = [label_voc[z] for z in out_full_trans]
                in_seqs.append(in_full_trans)
                out_seqs.append(out_full_trans)
                mismatch_inds.append(idx)
            except:
                pass

    # no syntactic transformations in the batch!
    if len(in_seqs) == 0:
        return None

    # otherwise, indexify and return
    else:
        in_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')
        out_trans_np = np.zeros((len(in_seqs), max_trans_size), dtype='int32')

        in_lengths = []
        out_lengths = []
        for idx in range(len(in_seqs)):
            curr_in = in_seqs[idx]
            in_trans_np[idx, :len(curr_in)] = [label_voc[z] for z in curr_in]
            in_lengths.append(len(curr_in))

            curr_out = out_seqs[idx]
            out_trans_np[idx, :len(curr_out)] = [label_voc[z] for z in curr_out]
            out_lengths.append(len(curr_out))

        return in_trans_np, out_trans_np, mismatch_inds,\
            np.array(in_lengths, dtype='int32'), np.array(out_lengths, dtype='int32')
Example #26
'''
Main function.
'''
if __name__ == "__main__":

    print("Beginning parse of PTB.ext...")

    data = []
    num_lines = rawgencount("PTB.ext")

    with open('PTB.ext', encoding='utf-8') as f:
        for sent_tree in tqdm(f, total=num_lines):

            # Parse this sent_tree into an NLTK tree object
            tree = ParentedTree.fromstring(sent_tree)

            # Get all phrases in this tree
            for phrase in get_coordphrases(tree):

                conjuncts = phrase[0]
                conjunction = phrase[1]
                phrase_cat = phrase[2]
                phrase_text = phrase[3]
                sent_text = get_tree_text(tree)

                # Only include two-termed coordinations
                if len(conjuncts) != 2:
                    continue

                row = []
def encode_data(text, parsed_repr, bpe, pp_vocab, parse_gen_voc, parse_net,
                tp_templates, tp_template_lens, net, rev_label_voc,
                rev_pp_vocab):
    stime = time.time()
    ssent = ' '.join(text.split())
    seg_sent = bpe.segment(ssent.lower()).split()

    results = []

    results.append(reverse_bpe(seg_sent))
    # encode sentence using pp_vocab, leave one word for EOS
    seg_sent = [pp_vocab[w] for w in seg_sent if w in pp_vocab]

    # add EOS
    seg_sent.append(pp_vocab['EOS'])
    torch_sent = Variable(
        torch.from_numpy(np.array(seg_sent, dtype='int32')).long().cuda())
    torch_sent_len = torch.from_numpy(np.array([len(seg_sent)],
                                               dtype='int32')).long().cuda()

    # encode parse using parse vocab
    parse_tree = ParentedTree.fromstring(parsed_repr.strip())
    parse_tree = deleaf(parse_tree)
    np_parse = np.array([parse_gen_voc[w] for w in parse_tree], dtype='int32')
    torch_parse = Variable(torch.from_numpy(np_parse).long().cuda())
    torch_parse_len = torch.from_numpy(
        np.array([len(parse_tree)], dtype='int32')).long().cuda()

    # generate full parses from templates
    beam_dict = parse_net.batch_beam_search(torch_parse.unsqueeze(0),
                                            tp_templates,
                                            torch_parse_len[:],
                                            tp_template_lens,
                                            parse_gen_voc['EOP'],
                                            beam_size=3,
                                            max_steps=150)
    seq_lens = []
    seqs = []
    for b_idx in beam_dict:
        prob, _, _, seq = beam_dict[b_idx][0]
        seq = seq[:-1]  # chop off EOP
        seq_lens.append(len(seq))
        seqs.append(seq)
    np_parses = np.zeros((len(seqs), max(seq_lens)), dtype='int32')
    for z, seq in enumerate(seqs):
        np_parses[z, :seq_lens[z]] = seq
    tp_parses = Variable(torch.from_numpy(np_parses).long().cuda())
    tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long().cuda()

    # generate paraphrases from parses
    try:
        beam_dict = net.batch_beam_search(torch_sent.unsqueeze(0),
                                          tp_parses,
                                          torch_sent_len[:],
                                          tp_len,
                                          pp_vocab['EOS'],
                                          beam_size=3,
                                          max_steps=40)
        for b_idx in beam_dict:
            prob, _, _, seq = beam_dict[b_idx][0]
            gen_parse = ' '.join([rev_label_voc[z] for z in seqs[b_idx]])
            gen_sent = ' '.join([rev_pp_vocab[w] for w in seq[:-1]])
            results.append(reverse_bpe(gen_sent.split()))
    except:
        print("beam search OOM")
        print(traceback.format_exc())
    return results
Example #28
def main():
    files = glob.glob(TMP_PATH + "*.info.xml")

    OUT = open("out.out", "w")

    EXAMPLE_OUT = open("examples/" + SOURCE + "_" + LABEL + ".json", "w")
    num_examples = 0

    total_files = len(files)
    no_tlinks = 0

    print("[", file=EXAMPLE_OUT)
    for file in files:
        print(file, file=OUT)
        soup = BeautifulSoup(open(file), "html.parser")

        sentence = soup.sentence

        print(sentence.string, file=OUT)

        # store tokens in a list.
        tokens = []
        for token in soup.tokens.find_all("t"):
            text = token.string.rsplit("\"", 3)[0].split("\"", 3)[-1]
            if text[0] == " ":
                text = text[1:]
            tokens.append(text)

        print(file=OUT)
        # parse events
        # <event id="e1" eiid="ei1" offset="2" string="said" tense="PAST"
        #  aspect="NONE" class="REPORTING" polarity="POS" modality="" happen=""
        #  lowerBoundDuration="" upperBoundDuration=""
        # />
        eid_dict = {}
        eiid_dict = {}
        for event in soup.events.find_all("event"):
            text = event["string"]
            token_pos = event["offset"]
            eid_dict[event["id"]] = text
            eiid_dict[event["eiid"]] = text
            print(text, file=OUT)

        # parse timexes
        # <timex tid="t1" text="autumn" offset="19" length="1" type="DATE"
        #  value="XXXX-FA" temporalFunction="false"/>
        timex_dict = {}
        for timex in soup.timexes.find_all("timex"):
            text = timex["text"]
            timex_dict[timex["tid"].strip()] = text
            print(text, file=OUT)

        print(file=OUT)

        tlinks = soup.find_all("tlink")
        if len(tlinks) == 0:
            no_tlinks += 1
            print("NO TLINKS", file=OUT)
        else:
            headers = ["e1", "e2", "relation"]
            table = []
            e1s = []
            e2s = []
            rels = []
            for tlink in tlinks:
                e1 = tlink["event1"]
                e2 = tlink["event2"]

                if e1 in eid_dict:
                    e1 = eid_dict[e1]
                elif e1 in eiid_dict:
                    e1 = eiid_dict[e1]
                elif e1 in timex_dict:
                    e1 = timex_dict[e1]
                else:
                    print("ERROR: Can't find e1", file=OUT)
                    print(eiid_dict)

                if e2 in eid_dict:
                    e2 = eid_dict[e2]
                elif e2 in eiid_dict:
                    e2 = eiid_dict[e2]
                elif e2 in timex_dict:
                    e2 = timex_dict[e2]
                else:
                    print("ERROR: Can't find e2", file=OUT)

                # print(e1, "\t", e2, "\t", tlink["relation"])
                table.append([e1, e2, tlink["relation"]])
            print(tabulate(table, headers=headers), file=OUT)
        print(file=OUT)

        parse = soup.parse.string

        t = ParentedTree.fromstring(parse)
        example = get_relation(t, LABEL)
        if example:
            if num_examples > 0:
                print(",", file=EXAMPLE_OUT)
            print(example.to_json(), file=EXAMPLE_OUT, end="")
            num_examples += 1

        print(parse, file=OUT)
        print(file=OUT)

    print("\n]", file=EXAMPLE_OUT)

    print("total files: ", total_files)
    print("files without tlinks: ", no_tlinks)
    print("files with failed event parsing: ", failed_event_parse)
    print("files with successful event parsing: ",
          total_files - failed_event_parse)
Example #29
    def __populate_Parses(lang, parsejson, new_parsedict):
        """
        """
        # start CoreNLP servers for UD1
        from stanfordnlp.server import CoreNLPClient

        cwd = os.getcwd()
        version = 'stanford-corenlp-full-2018-10-05'
        corenlp_path = re.findall(r'\S*/marta-v2',
                                  cwd)[0] + '/04_utils/' + version
        os.environ["CORENLP_HOME"] = corenlp_path
        if lang == 'en':
            lang = {}  # i.e. CoreNLP defaults to English model
            corenlpclient_UD1 = CoreNLPClient(properties={
                'ssplit.isOneSentence': True,
                'tokenize.whitespace': True
            },
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # parse annotator is necessary to obtain udfeats (for postags)

        if lang == 'fr':
            lang = 'french'
            corenlpclient_UD1 = CoreNLPClient(
                properties=lang,
                annotators=[
                    'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'
                ],
                memory='2G',
                be_quiet=True,
                max_char_length=100000,
                output_format='conllu'
            )  # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        if lang == 'zh':
            lang = 'chinese'
            corenlpclient_UD1 = CoreNLPClient(properties=lang,
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        # begin processing
        for DocID in parsejson:
            print('Now processing: ', dataset, DocID)
            sentence_offset = 0  # this is the 4th element in a TokenList

            # obtain the gold constituency parses for the document.
            ConstTrees = __obtain_ConstTrees_Gold(
                DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG)

            for sentence in parsejson[DocID]['sentences']:
                # 1. create a ParsePDTB object
                __parsepdtb = ParsePDTB(
                    lang=LANG,
                    docid=DocID,
                    sentid=sentence_offset,
                    gold_consttree=ConstTrees[sentence_offset],
                    pdtb_version=PDTB_VERSION)

                # 2. add to .RawText and .Words
                __parsepdtb.RawText = " ".join(
                    [word[0] for word in sentence['words']])
                __parsepdtb.Words = sentence['words']

                # 3. add to ConstTree_Auto. generate parse if missing
                if sentence['parsetree'] == '(())\n':
                    _parse = a2_parsers._parse_rawtext2consttree(
                        LANG, __parsepdtb.RawText, tokenized=True)
                    __parsepdtb.ConstTree_Auto = _parse
                else:
                    __parsepdtb.ConstTree_Auto = sentence['parsetree']

                # 3. write to temp file, for converting to SD/UD1 in next steps
                with open('./02_modelbuilding/02_output/input_temp.parser',
                          'w+') as f:
                    f.write(__parsepdtb.ConstTree_Gold)

                # 4. convert constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold
                a2_parsers.convert_const2dep(
                    LANG,
                    dataset,
                    filename='',
                    readpath='/02_modelbuilding/02_output/input_temp.parser',
                    writepath='/02_modelbuilding/02_output/output_temp.parser',
                    format_='UD1',
                    usage='experiments')

                with open('./02_modelbuilding/02_output/output_temp.parser',
                          'r') as f:
                    UD1_Gold_conllu = f.read()

                def __conllu2tuple(conllu_doc):
                    """helper function to convert CoNLL format into 3-tuple used by CoNLL 2016 organisers to store dependency parses
                    """
                    to_list = conllu_doc.split('\n')
                    tokenlist = [
                        i.split('\t')[1] + '-' + i.split('\t')[0]
                        for i in to_list if i != ''
                    ]  # convert  CoNLL line to <wordform>-<token num>
                    tokenlist.insert(0,
                                     'ROOT-0')  # add a root token to the start
                    deptree_gold = [
                        [
                            i.split('\t')[7], tokenlist[int(i.split('\t')[6])],
                            i.split('\t')[1] + '-' + i.split('\t')[0]
                        ] for i in to_list if i != ''
                    ]  # convert to CoNLL 2016 dependencies format
                    return deptree_gold

                __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu)

                # 5. automatically generate UD 1.0 constituency parse (from raw text), place into same 3-tuple format as CoNLL 2016 Shared Task,and add to DepTree_UD1_Auto
                UD1_Auto_conllu = corenlpclient_UD1.annotate(
                    __parsepdtb.RawText)
                __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu)

                # 6. add PTB-style and UD pos tags to .Words. Each of the variable below contain a list comprising 2-tuples. each tuple is (<wordform>, <part of speech>)

                globals()['pos_PTBGold'] = [
                    i for i in ParentedTree.fromstring(
                        __parsepdtb.ConstTree_Gold).pos() if i[-1] != '-NONE-'
                ]  # gold PTB parses have traces and these causes misalignment with the surface form. we drop these since parsers don't predict traces (Johannsen & Søgaard, 2013)
                globals()['pos_PTBAuto'] = ParentedTree.fromstring(
                    __parsepdtb.ConstTree_Auto).pos()
                globals()['pos_UDGold'] = [(i.split('\t')[1], i.split('\t')[3])
                                           for i in UD1_Gold_conllu.split('\n')
                                           if i != '']
                globals()['pos_UDAuto'] = [(i.split('\t')[1], i.split('\t')[3])
                                           for i in UD1_Auto_conllu.split('\n')
                                           if i != '']

                for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']:
                    try:
                        _tagset = globals()['pos_' + postagset]
                        assert len(_tagset) == len(__parsepdtb.Words)
                        for idx in range(len(__parsepdtb.Words)):
                            # add the part of speech as a new key in the dictionary for the token in .Words
                            __parsepdtb.Words[idx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})

                    except AssertionError as e:

                        e.args += (
                            postagset.upper() +
                            " is not of the same size as the .Words attribute for this sentence.",
                        )
                        print(e)
                        print("Continuing to attempt alignment of tokens.")
                        _words = [i[0] for i in __parsepdtb.Words]
                        _words_maxidx = len(_words) - 1

                        #'drop' the additional tokens in _tagset
                        _tagset = [i for i in _tagset if i[0] in _words]
                        _words_curridx = -1  # start with -1
                        for idx in range(len(_tagset)):
                            _words_curridx += 1
                            while __parsepdtb.Words[_words_curridx][
                                    0] != _tagset[idx][
                                        0] and _words_curridx < _words_maxidx:
                                __parsepdtb.Words[_words_curridx][1].update(
                                    {
                                        'PartOfSpeech_' + postagset:
                                        'ParserError'
                                    }
                                )  # place a marker identifying the missing pos tag as an error from parsing
                                _words_curridx += 1
                            __parsepdtb.Words[_words_curridx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})
                            continue
                        # raise
                sentence_offset += 1  # increase sentence offset before moving to handle next sentence

                try:
                    new_parsedict[DocID].append(__parsepdtb)
                except:
                    new_parsedict[DocID] = [__parsepdtb]

        # shut down the CoreNLP servers
        corenlpclient_UD1.stop()
Example #30
 def test_read_indexed_tree(self):
     leafreader = IndexedLeafReader()
     tree = ParentedTree.fromstring('(S1 (S (S (NP (PRP I|0|1)) (VP (VBP have|2|6) (NP (DT a|3|4) (NN book|9|13)))) (. .|13|14)))',
                                    read_leaf=leafreader.read_leaf)
     print(tree.pprint(margin=float("inf")))
def encode_data(out_file):

    fn = ['idx', 'template', 'generated_parse', 'sentence']
    ofile = codecs.open(out_file, 'w', 'utf-8')
    out = csv.DictWriter(ofile, delimiter='\t', fieldnames=fn)
    out.writerow(dict((x,x) for x in fn))

    # read parsed data
    infile = codecs.open(args.parsed_input_file, 'r', 'utf-8', errors='ignore')
    inrdr = csv.DictReader(infile, delimiter='\t')

    # loop over sentences and transform them
    for d_idx, ex in enumerate(inrdr):
        stime = time.time()
        ssent = ' '.join(ex['tokens'].split())
        seg_sent = bpe.segment(ssent.lower()).split()

        # write gold sentence
        out.writerow({'idx': ex['idx'],
            'template':'GOLD', 'generated_parse':ex['parse'], 
            'sentence':reverse_bpe(seg_sent)})

        # encode sentence using pp_vocab, leave one word for EOS
        seg_sent = [pp_vocab[w] for w in seg_sent if w in pp_vocab]

        # add EOS
        seg_sent.append(pp_vocab['EOS'])
        if args.gpu >= 0:
            torch_sent = Variable(torch.from_numpy(np.array(seg_sent, dtype='int32')).long().cuda())
            torch_sent_len = torch.from_numpy(np.array([len(seg_sent)], dtype='int32')).long().cuda()
        else:
            torch_sent = Variable(torch.from_numpy(np.array(seg_sent, dtype='int32')).long())
            torch_sent_len = torch.from_numpy(np.array([len(seg_sent)], dtype='int32')).long()

        # encode parse using parse vocab
        parse_tree = ParentedTree.fromstring(ex['parse'].strip())
        parse_tree = deleaf(parse_tree)
        np_parse = np.array([parse_gen_voc[w] for w in parse_tree], dtype='int32')
        if args.gpu >= 0:
            torch_parse = Variable(torch.from_numpy(np_parse).long().cuda())
            torch_parse_len = torch.from_numpy(np.array([len(parse_tree)], dtype='int32')).long().cuda()
        else:
            torch_parse = Variable(torch.from_numpy(np_parse).long())
            torch_parse_len = torch.from_numpy(np.array([len(parse_tree)], dtype='int32')).long()

        # generate full parses from templates
        beam_dict = parse_net.batch_beam_search(torch_parse.unsqueeze(0), tp_templates,
            torch_parse_len[:], tp_template_lens, parse_gen_voc['EOP'], beam_size=3, max_steps=150)
        seq_lens = []
        seqs = []
        for b_idx in beam_dict:
            prob,_,_,seq = beam_dict[b_idx][0]
            seq = seq[:-1] # chop off EOP
            seq_lens.append(len(seq))
            seqs.append(seq)
        np_parses = np.zeros((len(seqs), max(seq_lens)), dtype='int32')
        for z, seq in enumerate(seqs):
            np_parses[z, :seq_lens[z]] = seq
        if args.gpu >= 0:
            tp_parses = Variable(torch.from_numpy(np_parses).long().cuda())
            tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long().cuda()
        else:
            tp_parses = Variable(torch.from_numpy(np_parses).long())
            tp_len = torch.from_numpy(np.array(seq_lens, dtype='int32')).long()
        

        # generate paraphrases from parses
        # try:
        beam_dict = net.batch_beam_search(torch_sent.unsqueeze(0), tp_parses,
            torch_sent_len[:], tp_len, pp_vocab['EOS'], beam_size=3, max_steps=40)
        for b_idx in beam_dict:
            prob,_,_,seq = beam_dict[b_idx][0]
            gen_parse = ' '.join([rev_label_voc[z] for z in seqs[b_idx]])
            gen_sent = ' '.join([rev_pp_vocab[w] for w in seq[:-1]])
            out.writerow({'idx': ex['idx'],
                'template':templates[b_idx], 'generated_parse':gen_parse,
                'sentence':reverse_bpe(gen_sent.split())})
        # except:
        #     print('beam search OOM')

        print(d_idx, time.time() - stime)
                   dest="parse",
                   default="data/projectsyndicate/projectsyndicate.truecased.de.20.parse",
                   help="german parses parse persentence")
    opt.add_option("-m", dest="map", default="data/projectsyndicate/de-negra.map")
    opt.add_option("-s", dest="word2unipos", default="data/projectsyndicate/de.pos")
    (options, _) = opt.parse_args()
    word2pos = {}
    f = codecs.open(options.parse, 'r', 'utf-8')
    posmap = {}
    for line in codecs.open(options.map, 'r', 'utf-8').readlines():
        de_pos, uni_pos = line.split()
        posmap[de_pos] = uni_pos

    with f:
        for line in f:
            t = ParentedTree.fromstring(line.strip())
            for pos_token in t.subtrees(lambda t: t.height() == 2):
                pos_token = str(pos_token)[1:-1]
                (pos, token) = pos_token.split()
                s = word2pos.get(token, set())
                s.add(posmap[pos])
                word2pos[token] = s
    f.close()
    w = codecs.open(options.word2unipos, 'w', 'utf-8')
    for token, set_pos in word2pos.items():
        print(token, ':', ','.join(set_pos))
        w.write(token + '\t' + ','.join(set_pos) + '\n')
    w.flush()
    w.close()
Example #33
def tree_to_ptree(tree: nltk.Tree):
    tree_str = tree.__str__()
    ptree = PTree.fromstring(tree_str)
    return ptree
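Small usage example showing why the conversion is useful (assuming PTree is nltk's ParentedTree, as imported elsewhere in the file): the converted tree carries parent pointers that a plain nltk.Tree lacks:

import nltk

t = nltk.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
pt = tree_to_ptree(t)
print(pt[0].parent().label())  # 'S'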
Example #34
 def __init__(self, parse_string):
     self.__tree = ParentedTree.fromstring(parse_string)
Example #35
    args = get_args()

    i = 1
    tot = str(len(args.input_files))

    for file in args.input_files:

        print("(" + str(i) + "/" + tot + ")")
        print("Gathering coordination stats from " + file + "...")

        sents = pd.read_csv(file)
        data = []

        for index, row in tqdm(sents.iterrows(), total=len(sents.index)):
            parse_tree = row["Sentence Parse Tree"]
            tree = ParentedTree.fromstring(parse_tree)
            sent = get_tree_text(tree)
            for coord in get_simple_coordphrases(tree):
                category1 = coord[0][0]
                conjunct1 = coord[0][1]
                conjunction = coord[1]
                category2 = coord[2][0]
                conjunct2 = coord[2][1]
                data.append([category1, conjunct1, category2, conjunct2,
                             conjunction, sent, parse_tree])

        columns = ['1st Conjunct Category', '1st Conjunct Text',
                   '2nd Conjunct Category', '2nd Conjunct Text',
                   'Conjunction', 'Sentence Text', 'Sentence Parse Tree']
        df = pd.DataFrame(data, columns=columns)
        df.drop_duplicates(inplace=True)
Example #36
 def test_read_normal_tree(self):
     leafreader = LeafReader('I have a book.')
     tree = ParentedTree.fromstring('(S1 (S (S (NP (PRP I)) (VP (VBP have) (NP (DT a) (NN book)))) (. .)))',
                            read_leaf=leafreader.read_leaf)
     print(tree.pprint(margin=float("inf")))
Example #37
    def match(self, tree):
        if not isinstance(tree, ParentedTree):
            raise AttributeError

        self._query_tree = ParentedTree.fromstring(str(tree))
Example #38
def train(epoch, model, train_data, valid_data, train_loader, valid_loader, optimizer, criterion, dictionary, bpe, args):
    
    timer = Timer()
    n_it = len(train_loader)
    
    for it, data_idxs in enumerate(train_loader):
        model.train()
        
        data_idxs = np.sort(data_idxs.numpy())
        
        # get batch of raw sentences and raw syntax
        sents_ = train_data[0][data_idxs]
        synts_ = train_data[1][data_idxs]
            
        batch_size = len(sents_)
        
        # initialize tensors
        sents = np.zeros((batch_size, args.max_sent_len), dtype=np.long)    # words without position
        synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.long)  # syntax
        targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.long)  # target output
        
        for i in range(batch_size):
            
            # bpe segment and convert to tensor
            sent_ = sents_[i]
            sent_ = bpe.segment(sent_).split()
            sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_]
            sents[i, :len(sent_)] = sent_
            
            # add <sos> and <eos> for target output
            targ_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
            targs[i, :len(targ_)] = targ_
            
            # parse syntax and convert to tensor
            synt_ = synts_[i]
            synt_ = ParentedTree.fromstring(synt_)
            synt_ = deleaf(synt_)
            synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx]
            synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
            synts[i, :len(synt_)] = synt_
            
        sents = torch.from_numpy(sents).cuda()
        synts = torch.from_numpy(synts).cuda()
        targs = torch.from_numpy(targs).cuda()
        
        # forward
        outputs = model(sents, synts, targs)
        
        # calculate loss
        targs_ = targs[:, 1:].contiguous().view(-1)
        outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
        optimizer.zero_grad()
        loss = criterion(outputs_, targs_)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        if it % args.log_interval == 0:
            # print current loss
            valid_loss = evaluate(model, valid_data, valid_loader, criterion, dictionary, bpe, args)
            print("| ep {:2d}/{} | it {:3d}/{} | {:5.2f} s | loss {:.4f} | g_norm {:.6f} | valid loss {:.4f} |".format(
                epoch, args.n_epoch, it, n_it, timer.get_time_from_last(), loss.item(), model.grad_norm, valid_loss))
            
        if it % args.gen_interval == 0:
            # generate output to args.output_dir
            generate(epoch, it, model, valid_data, valid_loader, dictionary, bpe, args)
            
        if it % args.save_interval == 0:
            # save model to args.model_dir
            torch.save(model.state_dict(), os.path.join(args.model_dir, "synpg_epoch{:02d}.pt".format(epoch)))