Example #1
def get_tokens(fname, stopwords):
    with open(fname, encoding='utf-8') as f:
        text = f.read()

    text = re.sub(r'\d', '9', text)
    # word_re = re.compile(r'(\p{L}[\p{L}_-]+|\p{P}+)')
    word_re = re.compile(r'(\p{L}[\p{L}_-]*|\p{N}+)')
    tokens = word_re.findall(text)

    # retain = set(['NOUN', 'ADJ', 'ADV', 'PROPN'])
    retain = set(['NOUN', 'ADJ'])
    # pos_tagged = pos_tag(tokens)
    # print "pos_tags:", " ".join("{}_{}".format(*t) for t in pos_tagged)
    tokens = [tok for tok, tag in pos_tag(tokens)
              if map_tag('en-ptb', 'universal', tag) in retain]

    tokens = [tok.lower() for tok in tokens]
    tokens = [tok for tok in tokens if tok not in stopwords]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(tok) for tok in tokens]

    # print "====="
    # print "tokens:", " ".join(tokens)
    # print "====="

    return tokens
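Note that the \p{L} / \p{N} character classes are not supported by the standard-library re module, so this snippet presumably imports the third-party regex package under the name re. A minimal driver sketch under that assumption (the file path is a placeholder):

from nltk.corpus import stopwords  # requires the NLTK 'stopwords' corpus

english_stopwords = set(stopwords.words('english'))
tokens = get_tokens('document.txt', english_stopwords)  # placeholder path
print(tokens[:20])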
Example #2
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    # Currently only supports English and Russian.
    if lang not in ['eng', 'rus']:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')")
    else:
        tagged_tokens = tagger.tag(tokens)
        if tagset:  # Maps to the specified tagset.
            if lang == 'eng':
                tagged_tokens = [(token, map_tag('en-ptb', tagset, tag))
                                 for (token, tag) in tagged_tokens]
            elif lang == 'rus':
                # Note that the new Russian POS tags from the model contain suffixes,
                # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                tagged_tokens = [(token,
                                  map_tag('ru-rnc-new', tagset,
                                          tag.partition('=')[0]))
                                 for (token, tag) in tagged_tokens]
        return tagged_tokens
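The mapping step above is just map_tag; a quick sanity check of the English ('en-ptb' to 'universal') direction, which is the one most of these examples use:

from nltk.tag.mapping import map_tag  # requires the 'universal_tagset' resource

print(map_tag('en-ptb', 'universal', 'NNS'))  # NOUN
print(map_tag('en-ptb', 'universal', 'VBD'))  # VERB
print(map_tag('en-ptb', 'universal', 'JJR'))  # ADJ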
Example #3
def create_corpus(tagged):
    """ Take a list of tagged words and return a corpus as a list of tagged
    words with universal tagging and filtering all non-word entries """
    corpus = []
    for pair in tagged:
        if match(r'[a-zA-Z0-9_-]+', pair[0]):
            try:
                corpus.append(Word(pair[0],
                              map_tag('en-ptb', 'universal', pair[1])))
            except KeyError:
                print('Part mismatch:', pair[0], pair[1])
    return corpus
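A hedged driver for create_corpus, assuming Word is a simple (text, tag) container; a namedtuple stands in for the original class here, and the exact tags depend on the tagger model:

from collections import namedtuple
from re import match
from nltk import pos_tag, word_tokenize
from nltk.tag.mapping import map_tag

Word = namedtuple('Word', ['text', 'tag'])  # stand-in for the original Word class

tagged = pos_tag(word_tokenize("Dogs bark loudly."))
print(create_corpus(tagged))
# roughly: [Word(text='Dogs', tag='NOUN'), Word(text='bark', tag='VERB'), Word(text='loudly', tag='ADV')]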
Example #4
    def compute_features(self, s, count):

        # preprocess
        tok_sent = nltk.tokenize.word_tokenize(s)
        stop_tok_sent = [x for x in tok_sent if x not in cachedStopWords]

        # location features
        P = 1.0/count
        F5 = 1 if count <=5 else 0
        LEN = len(stop_tok_sent)/30.0

        # language modelling
        LM = LModel.score(s)

        # pos tagging features
        tag_fd = FreqDist(map_tag("en-ptb", "universal",tag) if map_tag("en-ptb", "universal",tag) not in cachedStopPOStags else "OTHER" for (word, tag) in pos_tagger(tok_sent))
        NN = tag_fd.freq("NOUN")
        VB = tag_fd.freq("VERB")

        # headline-sentence similarity
        VS1 = 1 - spatial.distance.cosine(self.hl_vsv_1.toarray(), self.father.cv.transform([s]).toarray())
        TFIDF = 1 - spatial.distance.cosine(self.hl_tfidf.toarray(), self.father.tv.transform([s]).toarray())

        # topic description-sentence similarity
        CT = 1 - spatial.distance.cosine(self.father.desc_vsv.toarray(), self.father.cv.transform([s]).toarray())
        Q = 1 - spatial.distance.cosine(self.father.title_vsv.toarray(), self.father.cv.transform([s]).toarray())

        # security checks
        if math.isnan(VS1):
            VS1 = 0
            print(self.father.code, self.id)
        if math.isnan(CT):
            CT = 0
            print(self.father.code, self.id)
        if math.isnan(Q):
            Q = 0
            print(self.father.code, self.id)

        # active features
        return np.asarray([P, F5, LEN, LM, VS1, TFIDF, VB, NN, CT, Q])
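The POS-tagging features (NN, VB) are the only self-contained part of this method; a sketch of just that computation, assuming pos_tagger behaves like nltk.pos_tag and cachedStopPOStags is a set of universal tags to collapse into "OTHER":

from nltk import FreqDist, pos_tag, word_tokenize
from nltk.tag.mapping import map_tag

def pos_frequencies(sentence, collapsed_tags=frozenset({'.', 'X', 'NUM'})):
    # Fraction of NOUN and VERB tags in the sentence (stand-in for the NN/VB
    # features above; collapsed_tags plays the role of cachedStopPOStags).
    universal = (map_tag('en-ptb', 'universal', tag)
                 for _, tag in pos_tag(word_tokenize(sentence)))
    fd = FreqDist(tag if tag not in collapsed_tags else 'OTHER' for tag in universal)
    return fd.freq('NOUN'), fd.freq('VERB')

print(pos_frequencies("The quick brown fox jumps over the lazy dog."))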
Example #5
def tagstr2tree(s,
                chunk_label="NP",
                root_label="S",
                sep="/",
                source_tagset=None,
                target_tagset=None):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            if len(stack) != 1:
                raise ValueError("Unexpected [ at char {:d}".format(
                    match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                raise ValueError("Unexpected ] at char {:d}".format(
                    match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError("Expected ] at char {:d}".format(len(s)))
    return stack[0]
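A usage sketch for the bracketed format described in the docstring; square brackets delimit NP chunks and each token is written as text/tag:

tree = tagstr2tree("[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
print(tree)
# -> an 'S' tree containing an 'NP' chunk for "the little cat", the tokens
#    ('sat', 'VBD') and ('on', 'IN'), and a second 'NP' chunk for "the mat".

# With both tagsets given, each tag is additionally passed through map_tag:
tree_universal = tagstr2tree("[ the/DT cat/NN ] sat/VBD",
                             source_tagset='en-ptb', target_tagset='universal')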
Example #6
    def extract_reverb_patterns(text):

        text_tokens = word_tokenize(text)
        tags_ptb = pos_tag(text_tokens)
        tags = []
        for t in tags_ptb:
            tag = map_tag('en-ptb', 'universal', t[1])
            tags.append((t[0], tag))

        patterns = []
        patterns_tags = []
        i = 0
        limit = len(tags)-1
        while i <= limit:
            tmp = io.StringIO()
            tmp_tags = []

            # a ReVerb pattern always starts with a verb
            if tags[i][1] == 'VERB':
                tmp.write(tags[i][0]+' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1

                # V = verb particle? adv? (also capture auxiliary verbs)
                while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # W = (noun | adj | adv | pron | det)
                while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV',
                                                    'PRON', 'DET']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # P = (prep | particle | inf. marker)
                while i <= limit and tags[i][1] in ['ADP', 'PRT']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1
                # add the built pattern to the list of collected patterns
                patterns.append(tmp.getvalue())
                patterns_tags.append(tmp_tags)
            i += 1

        return patterns, patterns_tags
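A short driver for the method above (assuming it is callable as a plain function, e.g. a static method); the sentence is arbitrary and the tagger output may vary slightly:

sentence = "The company was founded in 2006 and is based in San Francisco."
patterns, patterns_tags = extract_reverb_patterns(sentence)
print(patterns)        # expected to be something like ['was founded in ', 'is based in ']
print(patterns_tags)   # the same spans as (token, universal-tag) pairs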
Example #7
    def parse(self, orig_tokens):
        if orig_tokens and type(orig_tokens[0]) is tuple:
            tokens = [token for token, _ in orig_tokens]
        else:
            tokens = orig_tokens

        tokenized_ud = list(
            map(lambda x: (x[0], map_tag('ru-rnc', 'universal', x[1])),
                pos_tag(tokens, lang='rus')))
        tokenized_nltk = pos_tag(tokens, lang='rus')
        tokenized_mystem = [(token, self.mystem_tagger.tag_word(token)[0][1])
                            for token in tokens]

        # print(self.chunker_iis.parse(tokenized_ud))

        tags_nltk = self.chunker_nltk.parse(tokenized_nltk, return_tree=False)
        tags_ud = self.chunker_nltk.parse(tokenized_ud, return_tree=False)
        tags_mystem = self.chunker_nltk.parse(tokenized_mystem,
                                              return_tree=False)
        tags_iis = tree2conlltags(self.chunker_iis.parse(tokenized_ud))
        tags_grammar = tree2conlltags(
            self.grammar_chunker.parse(tokenized_mystem))

        result_tags = [tags_nltk, tags_ud, tags_mystem, tags_grammar, tags_iis]

        if tokens is orig_tokens:
            tag_source = tags_ud
        else:
            tag_source = orig_tokens

        tags = [(token, tag_source[ind][1],
                 pick_tag([tags_sp[ind][2]
                           for tags_sp in result_tags], tags_ud[ind][1]))
                for ind, token in enumerate(tokens)]

        # for ind, (token,pos,iob_tag) in enumerate(tags):
        #     if token in set(['таких', 'такие', 'такими', 'как', 'включая', 'и', 'или','другие', 'других', 'другими', 'особенно', 'в', 'частности', ',']):
        #         tags[ind] = (token, pos, 'O')

        for ind, (token, pos, iob_tag) in enumerate(tags):
            if ind == 0:
                continue
            if iob_tag == "B-NP*":
                if tags[ind - 1][2] in {'B-NP', 'I-NP'}:
                    tags[ind] = (token, pos, 'I-NP')
                else:
                    tags[ind] = (token, pos, 'B-NP')
            if iob_tag == "I-NP" and tags[ind - 1][2] not in {'B-NP', 'I-NP'}:
                tags[ind] = (token, pos, 'B-NP')

        return conlltags2tree(tags)
Example #8
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    # Currently only supports English and Russian.
    if lang not in ['eng', 'rus']:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')"
        )
    else:
        tagged_tokens = tagger.tag(tokens)
        if tagset:  # Maps to the specified tagset.
            if lang == 'eng':
                tagged_tokens = [
                    (token, map_tag('en-ptb', tagset, tag))
                    for (token, tag) in tagged_tokens
                ]
            elif lang == 'rus':
                # Note that the new Russian POS tags from the model contain suffixes,
                # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                tagged_tokens = [
                    (token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
                    for (token, tag) in tagged_tokens
                ]
        return tagged_tokens
Example #9
def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
    # Currently only supports English and Russian.
    if lang not in ["eng", "rus"]:
        raise NotImplementedError(
            "Currently, NLTK pos_tag only supports English and Russian "
            "(i.e. lang='eng' or lang='rus')")
    # Raise an error if tokens is a single string rather than a list of strings
    elif isinstance(tokens, str):
        raise TypeError("tokens: expected a list of strings, got a string")

    else:
        tagged_tokens = tagger.tag(tokens)
        if tagset:  # Maps to the specified tagset.
            if lang == "eng":
                tagged_tokens = [(token, map_tag("en-ptb", tagset, tag))
                                 for (token, tag) in tagged_tokens]
            elif lang == "rus":
                # Note that the new Russian POS tags from the model contain suffixes,
                # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                tagged_tokens = [(token,
                                  map_tag("ru-rnc-new", tagset,
                                          tag.partition("=")[0]))
                                 for (token, tag) in tagged_tokens]
        return tagged_tokens
Example #10
def featurize(line, idnum, pat):
    
    word_tag = hpt.tag(line.split())  # tag the whole sentence
    last = len(word_tag) - 1
    past_nuc = ''
    past_kind = ''
    
    filename = pat+'/'+str(idnum)   
    with open(str(filename), 'a') as file: #open json file
        
        for (i,(word,tag)) in enumerate(word_tag):
            current_word = []
            current_word.append(('word',word.lower()))
            current_word.append(('tag',tag))
            current_word.append(('collps_tag',map_tag(source='en-ptb',target='universal', source_tag=tag))) # source = 'en-ptb' is for wsj [TBD no wsj?]
            current_word.append(('function',bool(word in function)))
            current_word.append(('negation',bool(word in negation)))
            try:
                phns = d[word.lower()][0]
            except:
                # word not in the pronouncing dictionary: log it so we can
                # check later whether it is a number, dot, pound sign, etc.
                f.write(word)
                f.write('\n')
                continue
            sylls, num_syll, nuc, kind = syll_detector(phns)
            current_word.append(('sylls', sylls))             
            current_word.append(('num_sylls', num_syll))
            current_word.append(('nuc', nuc))
            current_word.append(('nuc_kind', kind))
            
            
            if i > 0: # if not first word
                past_word.append(('right_nuc',nuc)) 
                past_word.append(('right_nuc_kind',kind)) 
                
                json.dump(OrderedDict(past_word), file, indent=4) #copy past_word to json
                current_word.append(('left_nuc',past_nuc))
                current_word.append(('left_nuc_kind',past_kind))
            past_nuc = nuc # after updating current_word, current nuc becomes past_nuc
            past_kind = kind
                        
            if i == 0:
                current_word.append(('left_nuc','None'))
                
            if i == last:
                current_word.append(('right_nuc','None'))
                json.dump(OrderedDict(current_word), file, indent=4)  # copy the final word to JSON
            
            past_word = copy.deepcopy(current_word) # keep past dict
Example #11
    def test_reverb_patterns_extraction(sentences):
        for line in fileinput.input(sentences):
            #s = line.split('sentence:')[1].strip()
            text_tokens = word_tokenize(re.sub(r"</?e[1-2]>|\"", "", line))
            tagged = pos_tag(text_tokens)

            # convert the tags to reduced tagset (Petrov et al. 2012)
            # http://arxiv.org/pdf/1104.2086.pdf
            tags = []
            for t in tagged:
                tag = map_tag('en-ptb', 'universal', t[1])
                tags.append((t[0], tag))

            #r = Relationship(None, s, None, None, None)
            #extractRelationalWords(r)
            print(tags)
Example #12
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]
Example #13
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag # doctest: +SKIP
        >>> from nltk.tokenize import word_tokenize # doctest: +SKIP
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +SKIP
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
        'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
        ('.', '.')]

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = load(_POS_TAGGER)
    if tagset:
        return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
    return tagger.tag(tokens)
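With tagset='universal' the Penn Treebank tags from the docstring example are passed through map_tag('en-ptb', 'universal', ...); roughly:

from nltk.tokenize import word_tokenize

print(pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal'))
# [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
#  ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]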
Example #14
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag # doctest: +SKIP
        >>> from nltk.tokenize import word_tokenize # doctest: +SKIP
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +SKIP
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
        'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
        ('.', '.')]

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = load(_POS_TAGGER)
    if tagset:
        return [(token, map_tag('en-ptb', tagset, tag))
                for (token, tag) in tagger.tag(tokens)]
    return tagger.tag(tokens)
Example #15
def load_web_eng(filename="", trans=False):
    lines = list( open(filename, "r").readlines() )
    lines = [ l.strip() for l in lines]

    doc = []
    tags = []
    sent_w = []
    sent_t = []
    for l in lines:
        if l == '':
            doc.append(sent_w)
            tags.append(sent_t)
            sent_w = []
            sent_t = []
        else:
            w, t = l.split('\t')
            if t != "-NONE-":
                sent_w.append( w.lower() )
                if trans:
                    sent_t.append( map_tag('en-ptb', 'universal', t) )
                else:
                    sent_t.append( t )
    return doc, tags
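The loader expects one word<TAB>tag pair per line with a blank line after each sentence, and drops -NONE- traces; note that a sentence is only flushed when a blank line is seen, so the input should end with one. A small hedged round trip (the file name is a placeholder):

sample = "The\tDT\ndog\tNN\nbarked\tVBD\n.\t.\n\n"
with open("sample.pos", "w") as f:  # placeholder file name
    f.write(sample)

doc, tags = load_web_eng("sample.pos", trans=True)
print(doc)   # [['the', 'dog', 'barked', '.']]
print(tags)  # [['DET', 'NOUN', 'VERB', '.']]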
Example #16
    def morph(self, source, reference, constrain_pos=True):
        orig_tokenized = MosesTokenizer(lang='en').tokenize(source)
        pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                      for (token, tag) in self.tagger.tag(orig_tokenized)]
        pos_tagged = [(tagged[0], '.') if '&' in tagged[0] else tagged for tagged in pos_tagged]

        token_inflections = self.get_inflections(orig_tokenized, pos_tagged, constrain_pos)

        original_score, orig_predicted = self.get_score(source, reference)

        forward_perturbed, forward_score, \
        forward_predicted, num_queries_forward = self.search_seq2seq(token_inflections,
                                                                     orig_tokenized,
                                                                     source,
                                                                     original_score,
                                                                     reference)
        if forward_score == original_score:
            forward_predicted = orig_predicted

        if forward_score == 0:
            return MosesDetokenizer(lang='en').detokenize(forward_perturbed), forward_predicted, num_queries_forward + 1

        backward_perturbed, backward_score, \
        backward_predicted, num_queries_backward = self.search_seq2seq(token_inflections,
                                                                       orig_tokenized,
                                                                       source,
                                                                       original_score,
                                                                       reference,
                                                                       backward=True)
        if backward_score == original_score:
            backward_predicted = orig_predicted
        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_score < backward_score:
            return MosesDetokenizer(lang='en').detokenize(forward_perturbed), forward_predicted, num_queries
        else:
            return MosesDetokenizer(lang='en').detokenize(backward_perturbed), backward_predicted, num_queries
Example #17
def featurize(indir, pat, jsons):
    """ construct json file per sentence s.t. every word is described by: 
    its tag, its collapsed tag (according to NLTK 3.0), a function 
    word (bool), negation (bool), vowels, num of syllables, and current
    past and future of words in terms of the nucleuos and level of 
    prominence """
    
    outdir = pat + '/' + indir.rsplit("/", 1)[-1]    
    if not os.path.exists(outdir):
        os.makedirs(outdir)    
    past_nuc = ''
    past_kind = ''
    number_pattern = re.compile("(\d+)?[.,-]?(\\/)?\d+(th)?('s)?['s]?(\w+)?")
    hiphend_pattern_num_word_mix = re.compile("(\w+(.\w+)?(\w+)?)-\w+")
    past_word = []
    last = len(jsons)
    first_dumped = 0  # the first word is not necessarily the first one dumped
    for (i, terminal) in enumerate(dict_the_json(indir, jsons)):

        word = str(terminal["word"].lower())
        subword = str(terminal["subword"].lower())
        if word != subword:
            # print word, subword
            pass

        try:  # skip reparandum words; if the key is missing, keep going
            if terminal["disf_stat"] == "reparandum":
                continue
        except:
            pass

        current_word = []

        # if the word is a contracted verb suffix (other than "'s"), merge it into the previous word
        if past_word and (word in ["'d", "'ll", "'n'", "'re", "'ve",\
                                   "n't", "'m"]
                          or (past_word[0][1] + word == subword and \
                              subword in ["he's", "she's", "it's", "that's", "what's"])
                          or (past_word[0][1] + word == subword and word != "'s")):
            past_word[0][1] = past_word[0][1] + word
            try:
                phns = d[past_word[0][1].lower()][0]
            except:
                # print past_word[0][1].lower()
                phn1 = d[past_word[0][1][:-len(word)]][0]
                phn2 = d[str(word)][0]
                phns = phn1 + phn2

            sylls, num_syll, nuc, kind = syll_detector(phns)
            for k in range(7):  # max number of syllables
                try:
                    current_word.append((k, sylls[k]))
                except:
                    current_word.append((k, "NA"))  
                    
            #past_word[5][1] = sylls
            #past_word[6][1] = num_syll
            #past_word[7][1] = nuc
            #past_word[8][1] = kind
            #past_word[12][1] = num_syll
            past_word[12][1] = nuc
            past_word[13][1] = kind
            continue


        current_word.append(['word', word])
        current_word.append(('tag', terminal["tag"]))
        current_word.append(
            ('collps_tag', map_tag(Source, target='universal',\
                                   source_tag=terminal["tag"])))
        
        current_word.append(('function', bool(word in function)))
        current_word.append(('negation', bool(word in negation)))
        try:
            phns = d[word][0]
        except:
            # hyphen-separated
            if re.findall(hiphend_pattern_num_word_mix, word):
                phns = []
                for w in word.split('-'):
                    try:
                        phn = d[w.lower()][0]
                        phns.extend(phn)
                    except:
                        # and there's a number
                        if re.findall(number_pattern, word):
                            w = 'five'
                            phn = d[w.lower()][0]
                            phns.extend(phn)
            else:
                if word == ",":
                    print('h')
                # print word

                f.write(word)
                f.write('\n')  # log the unrecognized word (number, dot, pound sign, etc.)
                word = 'name'  # unrecognized word or pattern
                current_word[0][1] = word
                phns = d[word.lower()][0]

        sylls, num_syll, nuc, kind = syll_detector(phns)
        for k in range(7):  # max number of syllables
            try:
                current_word.append((k, sylls[k]))
            except:
                current_word.append((k, "NA"))   
                
        current_word.append(['nuc', nuc])
        current_word.append(['nuc_kind', kind])
        try:  # dialact
            current_word.append(("dialAct", terminal["dialAct:niteType"]))
            current_word.append(("seq", terminal["dialAct:id"]))          
        except:
            current_word.append(("dialAct", "no"))
        try:  # kontrast
            current_word.append(
                ("kontrast level", terminal["kontrast:level"]))
            current_word.append(
                ("kontrast type", terminal["kontrast:type"]))
        except:
            current_word.append(
                ("kontrast level", "no"))
            current_word.append(
                ("kontrast type", "no"))            
        try:  # phrases
            current_word.append(('phrases', terminal["phrases:type"]))
        except:
            current_word.append(('phrases', "no"))

        try:  # accents
            current_word.append(
                ('accents_strength', terminal["accents:strength"]))
        except:
            pass

        if first_dumped and i != last:  # a word has already been dumped and this is not the last one
            past_word.append(('right_nuc', nuc))
            past_word.append(('right_nuc_kind', kind))

            # copy past_word to json
            with open(outdir+'/'+past_id, 'w') as fjson:
                json.dump(OrderedDict(past_word), fjson, indent=4)
            current_word.append(('left_nuc', past_nuc))
            current_word.append(('left_nuc_kind', past_kind))
        # after updating current_word, current nuc becomes past_nuc
        past_nuc = nuc
        past_kind = kind

        if not first_dumped:
            current_word.append(('left_nuc', 'None'))
            first_dumped = 1

        if i == last:
            current_word.append(('right_nuc', 'None'))
            # copy the final word to JSON
            with open(outdir+'/'+past_id, 'w') as fjson:
                json.dump(OrderedDict(current_word), fjson, indent=4)            

        past_word = copy.deepcopy(current_word)  # keep past dict
        past_id = terminal["id"]
Example #18
def _pos_tag(tokens, tagset, tagger):
    tagged_tokens = tagger.tag(tokens)
    if tagset:
        tagged_tokens = [(token, map_tag('en-ptb', tagset, tag))
                         for (token, tag) in tagged_tokens]
    return tagged_tokens
Example #19
def _pos_tag(tokens, tagset, tagger):
    tagged_tokens = tagger.tag(tokens)
    if tagset:
        tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
    return tagged_tokens
Example #20
    def morph(self,
              question_dict,
              context,
              constrain_pos=True,
              conservative=False):
        original = question_dict['question']

        gold_starts = [ans['answer_start'] for ans in question_dict['answers']]
        gold_texts = [ans['text'] for ans in question_dict['answers']]
        gold_ends = [
            gold_starts[i] + len(text) for i, text in enumerate(gold_texts)
        ]
        question_dict['gold_char_spans'] = list(zip(gold_starts, gold_ends))
        question_dict['gold_texts'] = gold_texts

        orig_tokenized = MosesTokenizer(lang='en').tokenize(original)

        pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                      for (token, tag) in self.tagger.tag(orig_tokenized)]
        pos_tagged = [(tagged[0], '.') if '&' in tagged[0] else tagged
                      for tagged in pos_tagged]

        token_inflections = super(MorpheusQA, self).get_inflections(
            orig_tokenized, pos_tagged, constrain_pos)

        original_loss, init_predicted = self.get_loss(original, question_dict,
                                                      context)

        if self.metric_max(compute_f1, init_predicted,
                           question_dict['gold_texts']) == 0:
            return original, init_predicted, 1

        forward_perturbed, \
        forward_loss, forward_predicted, \
        num_queries_forward = self.search_qa(token_inflections,
                                             orig_tokenized,
                                             original_loss,
                                             question_dict,
                                             context,
                                             conservative)

        if conservative and self.metric_max(compute_f1, forward_predicted,
                                            question_dict['gold_texts']) == 0:
            return MosesDetokenizer(lang='en').detokenize(
                forward_perturbed), forward_predicted, num_queries_forward + 1

        backward_perturbed, \
        backward_loss, backward_predicted, \
        num_queries_backward = self.search_qa(token_inflections,
                                              orig_tokenized,
                                              original_loss,
                                              question_dict,
                                              context,
                                              conservative,
                                              backward=True)

        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_loss > backward_loss:
            return MosesDetokenizer(lang='en').detokenize(
                forward_perturbed), forward_predicted, num_queries
        else:
            return MosesDetokenizer(lang='en').detokenize(
                backward_perturbed), backward_predicted, num_queries
Example #21
def generate_dataset_json(files,
                          out_file,
                          lemma_know=None,
                          relationship_know=None):
    d = {}
    lemmas = set()
    stemmer = PorterStemmer()

    if os.path.isfile(out_file):
        with open(out_file, 'r') as f:
            data = f.read()
            d = json.loads(data)

    if lemma_know is not None:
        d.update(lemma_know)

    if relationship_know is None:
        relationship_know = {}

    for f in files:
        with open(f) as file:
            for line in tqdm.tqdm(file):
                l = line.strip()

                tokens = l.split()
                if len(l) == 0 or tokens[12] != 'Y':
                    continue

                lemma = tokens[2]
                pos = tokens[4]

                if lemma == '%':
                    k = '%25_' + mapping.map_tag('wsj', 'universal', pos)
                else:
                    k = lemma.lower() + '_' + mapping.map_tag(
                        'wsj', 'universal', pos)
                lemmas.add(k)

    lemmas = sorted(list(lemmas))
    for l in tqdm.tqdm(lemmas):
        if l not in d:

            lemma, pos = l.rsplit('_', 1)
            print(lemma, pos, 'to add.')

            # print(mapping.map_tag('wsj', 'universal', pos))
            b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY)

            if b == -1:
                return d

            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                       pos,
                                                       BABEL_KEY,
                                                       pos='POS')
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                       pos,
                                                       BABEL_KEY,
                                                       wn=False)
            if len(b) == 0:
                b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                       pos,
                                                       BABEL_KEY,
                                                       wn=False,
                                                       pos='POS')

            if len(b) == 0:
                lemma = stemmer.stem(lemma)
                print('Stemmer used: ', lemma)

                b = utils.getAssociatedSynsetsBabelnet(lemma, pos, BABEL_KEY)
                if len(b) == 0:
                    b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                           pos,
                                                           BABEL_KEY,
                                                           pos='POS')
                if len(b) == 0:
                    b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                           pos,
                                                           BABEL_KEY,
                                                           wn=False)
                if len(b) == 0:
                    b = utils.getAssociatedSynsetsBabelnet(lemma,
                                                           pos,
                                                           BABEL_KEY,
                                                           wn=False,
                                                           pos='POS')

            print(b)
            # assert(len(b) > 0)

            to_add = {}
            for s in b:
                links = relationship_know.get(
                    s, utils.getSemanticRelatioshipBabelnet(s, BABEL_KEY))
                to_add.update({s: links})

            d.update({l: to_add})

            with open(out_file, 'w') as f:
                f.write(json.dumps(d, ensure_ascii=False))
        else:
            print(l, 'already present')

    with open(out_file, 'w') as f:
        f.write(json.dumps(d, ensure_ascii=False))

    return d
Example #22
    def extract_reverb_patterns(text):
        """
        Extract ReVerb relational patterns
        http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf

        VERB - verbs (all tenses and modes)
        NOUN - nouns (common and proper)
        PRON - pronouns
        ADJ - adjectives
        ADV - adverbs
        ADP - adpositions (prepositions and postpositions)
        CONJ - conjunctions
        DET - determiners
        NUM - cardinal numbers
        PRT - particles or other function words
        X - other: foreign words, typos, abbreviations
        . - punctuation

        # extract ReVerb patterns:
        # V | V P | V W*P
        # V = verb particle? adv?
        # W = (noun | adj | adv | pron | det)
        # P = (prep | particle | inf. marker)
        """

        # split text into tokens
        text_tokens = PunktWordTokenizer().tokenize(text)

        # tag the sentence, using the default NLTK English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tags_ptb = pos_tag(text_tokens)

        # convert the tags to reduced tagset (Petrov et al. 2012)
        # http://arxiv.org/pdf/1104.2086.pdf
        tags = []
        for t in tags_ptb:
            tag = map_tag('en-ptb', 'universal', t[1])
            tags.append((t[0], tag))

        patterns = []
        patterns_tags = []
        i = 0
        limit = len(tags)-1

        while i <= limit:
            tmp = StringIO.StringIO()
            tmp_tags = []

            # a ReVerb pattern always starts with a verb
            if tags[i][1] == 'VERB':
                tmp.write(tags[i][0]+' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1

                # V = verb particle? adv? (also capture auxiliary verbs)
                while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # W = (noun | adj | adv | pron | det)
                while i <= limit and tags[i][1] in ['NOUN', 'ADJ', 'ADV', 'PRON', 'DET']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # P = (prep | particle | inf. marker)
                while i <= limit and tags[i][1] in ['ADP', 'PRT']:
                    tmp.write(tags[i][0]+' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1
                # add the built pattern to the list of collected patterns
                patterns.append(tmp.getvalue())
                patterns_tags.append(tmp_tags)
            i += 1

        return patterns, patterns_tags
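The V | V P | V W*P grammar from the docstring can also be written as an nltk RegexpParser chunk rule over the universal tags; this is only an equivalent sketch, not the original token-by-token scan:

from nltk import RegexpParser

reverb_grammar = r"""
REVERB: {<VERB> <VERB|PRT|ADV>* <NOUN|ADJ|ADV|PRON|DET>* <ADP|PRT>*}
"""
chunker = RegexpParser(reverb_grammar)
# `tags` is the (token, universal-tag) list built inside extract_reverb_patterns:
# chunker.parse(tags) groups each matching span under a REVERB node.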
Example #23
\tlabelloc="t";
\tlabel="%s";
\t""" %(
    '\\n'.join(
        textwrap.wrap(self.sentence)
    )
)
        
        output += u'\n\t'.join(map(operator.attrgetter('dot_str'), self.nodes))
        output += u'\n\n\t'
        output += u'\n\t'.join(map(operator.attrgetter('dot_str'), self.edges))
        output += u'\n'
        output += u'};\n'
        return output

map_tag_to_universal = lambda tag: map_tag('en-ptb', 'universal', tag)

STANFORD_ATTRIBUTES = {"Text": {"name": "token", "type": unicode}, 
                       "PartOfSpeech": {"name": "pos_tag", "type": str, "mapping_func": map_tag_to_universal}, 
                       "Lemma": {"name": "lemma", "type": str}}
def parse_token_line(l, prepend_root = True):
    """
    Parsing the line containing tokens and POS tags information

    l: str
         The token&POS line

    prepend_root: bool
         If True, root is automatically prepended to the list
    
    >>> tokens = parse_token_line(u"[Text=Schneider CharacterOffsetBegin=0 CharacterOffsetEnd=9 PartOfSpeech=NNP] [Text=Electric CharacterOffsetBegin=10 CharacterOffsetEnd=18 PartOfSpeech=NNP]", prepend_root = True)
Example #24
import sys
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from providedcode.dependencygraph import DependencyGraph
from nltk.tag import mapping

if len(sys.argv) != 2:
	sys.stderr.write("No model provided.")
	sys.exit(1)

tp = TransitionParser.load(sys.argv[1])

for sentence in sys.stdin: 
    s = DependencyGraph.from_sentence(sentence) #class DependencyGraph, function from_sentence
    for node in s.nodes:
            tag = s.nodes[node]['tag']
            ctag = mapping.map_tag('wsj','universal',tag)
            s.nodes[node]['ctag'] = ctag
    x = tp.parse([s])
    print(x[0].to_conll(10))

# model: sys.argv[1] - english.model
Example #25
    def extract_reverb_patterns(text):
        """
        Extract ReVerb relational patterns
        http://homes.cs.washington.edu/~afader/bib_pdf/emnlp11.pdf

        VERB - verbs (all tenses and modes)
        NOUN - nouns (common and proper)
        PRON - pronouns
        ADJ - adjectives
        ADV - adverbs
        ADP - adpositions (prepositions and postpositions)
        CONJ - conjunctions
        DET - determiners
        NUM - cardinal numbers
        PRT - particles or other function words
        X - other: foreign words, typos, abbreviations
        . - punctuation

        # extract ReVerb patterns:
        # V | V P | V W*P
        # V = verb particle? adv?
        # W = (noun | adj | adv | pron | det)
        # P = (prep | particle | inf. marker)
        """

        # split text into tokens
        text_tokens = PunktWordTokenizer().tokenize(text)

        # tag the sentence, using the default NLTK English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tags_ptb = pos_tag(text_tokens)

        # convert the tags to reduced tagset (Petrov et al. 2012)
        # http://arxiv.org/pdf/1104.2086.pdf
        tags = []
        for t in tags_ptb:
            tag = map_tag('en-ptb', 'universal', t[1])
            tags.append((t[0], tag))

        patterns = []
        patterns_tags = []
        i = 0
        limit = len(tags) - 1

        while i <= limit:
            tmp = StringIO.StringIO()
            tmp_tags = []

            # a ReVerb pattern always starts with a verb
            if tags[i][1] == 'VERB':
                tmp.write(tags[i][0] + ' ')
                t = (tags[i][0], tags[i][1])
                tmp_tags.append(t)
                i += 1

                # V = verb particle? adv? (also capture auxiliary verbs)
                while i <= limit and tags[i][1] in ['VERB', 'PRT', 'ADV']:
                    tmp.write(tags[i][0] + ' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # W = (noun | adj | adv | pron | det)
                while i <= limit and tags[i][1] in [
                        'NOUN', 'ADJ', 'ADV', 'PRON', 'DET'
                ]:
                    tmp.write(tags[i][0] + ' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1

                # P = (prep | particle | inf. marker)
                while i <= limit and tags[i][1] in ['ADP', 'PRT']:
                    tmp.write(tags[i][0] + ' ')
                    t = (tags[i][0], tags[i][1])
                    tmp_tags.append(t)
                    i += 1
                # add the built pattern to the list of collected patterns
                patterns.append(tmp.getvalue())
                patterns_tags.append(tmp_tags)
            i += 1

        return patterns, patterns_tags
Example #26
Tagging part of speech
Use maxent treebank pos tagging model in NLTK by default
Each consisting of a list of tokens
"""


print string.punctuation
with codecs.open("res/sentences_train.csv", "rU") as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        # remove punctuation
        # Ignore ascii decode error
        sentence = row[0].translate(string.maketrans("", ""), punctuation).decode('ascii', 'ignore')
        text = nltk.word_tokenize(sentence)
        original_tag = nltk.pos_tag(text)
        simplified_tag = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in original_tag]  # map the original PTB tags to universal tags
        simplified_tag = [(u'START', u'START')] + simplified_tag + [(u'END', u'END')]  # manually add START/END boundary tags
        #tokens.append(simplified_tag)
        tokens.extend(simplified_tag)
print tokens[0:10]


# TODO:BenchMark
# Using word association to generate suggestion
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word for word,tag in tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print sorted(bigram for bigram, score in scored)

finder.apply_freq_filter(2)
scored = finder.score_ngrams(bigram_measures.raw_freq)