Example #1
def clean_googlengram(line):
    """Removes speechtags from line specific to the googlengram module

    Param:
        line (unicode)
    Returns:
        line (unicode)
    """
    return_line = line.split("\t")[0]  # Get the ngram, remove year, counter, etc.
    clean = []
    words = WhitespaceTokenizer().tokenize(return_line)
    for word in words:
        # In n-grams (n > 1), transitions to specific tags are written as:
        # The_ADJ _NOUN_ (meaning: from "The" there is a transition to a noun).
        # We remove those.
        if word[0] != '_' and word[-1] != '_':
            # Split the token and the tag based on the '_'
            token, tag = str2tuple(word, '_')
            # Punct will be added using rules.
            if len(token) > 1:
                if tag not in ('PUNCT', '.', ''):
                    clean.append(token)
            elif token not in punctuation:
                clean.append(token)
    return_line = ' '.join(clean)
    if return_line != line:
        return True, return_line
    else:
        return False, line
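
# A minimal usage sketch (not part of the original module): the sample line below and the
# imports are assumptions; clean_googlengram needs punctuation, str2tuple and
# WhitespaceTokenizer in scope.
from string import punctuation
from nltk import str2tuple
from nltk.tokenize import WhitespaceTokenizer

changed, cleaned = clean_googlengram(u"The_DET quick_ADJ fox_NOUN\t2008\t42\t13")
print(changed, cleaned)  # -> True The quick fox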
Example #2
def align(sent):
    tagged = re.split(r'\s+', sent)
    raw_word = tagged[0]
    tagged[1] = re.compile(r'__[0-9]+').sub('', tagged[1])
    tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tagged[1])
    tagged = ''.join([morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
    fraction = list()
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)

        for i, syl in enumerate(morph):
            if i == 0:
                fraction.append([syl, "B-"+tag])
            else:
                fraction.append([syl, "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  # append "+" after the tag
    fraction[-1][1] = fraction[-1][1][:-1]
    print(raw_word,tagged)
    if raw_word == tagged:
        return fraction
    SM = SequenceMatcher(None, raw_word, tagged)
    blocks = list()
    if include_delete(SM):
        blocks = make_del_block(fraction, raw_word, tagged)
    else:
        mat_blocks = SM.get_matching_blocks()
        blocks = generate_block(fraction, mat_blocks)
        if len(mat_blocks) == 1:  # e.g. 온 vs 오/vx+ㄴ/etm: handle a completely mismatched form, just in case
            blocks = make_del_block(fraction, raw_word, tagged)

    print(blocks)
    for cur, nxt in pairwise(blocks):
        raw = raw_word[cur[0]:cur[1]]
        mor = tagged[cur[2]:cur[3]]
        print(raw,mor)
Example #3
 def lex_smooth(self, data):
     """Smoothing of the whole language model"""
     p_smoothed = defaultdict(float)
     for token in data:
         word, tag = str2tuple(token)
         p_smoothed[(word, tag)] = self.get_probs(word, tag)
     return p_smoothed
Example #4
 def unpack_keyword(item):
     val, vty, pty = item
     if analyse_pos:
         word, pos = nltk.str2tuple(val)
     else:
         word = val
         pos = "N/A"
     return (word, pos, vty*pty)
def tagged_corpus(filename):
    """
    Lazily read tagged corpus from a file `filename`
    """
    try:
        with open(filename, "r") as source:
            for line in source:
                (tokens, tags) = zip(*(str2tuple(wt) for wt in line.split()))
                yield (list(tokens), list(tags))
    except IOError as error:
        exit(error)
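
# A hedged usage sketch: "corpus.tagged" is a hypothetical file whose lines look like
# "The/DT dog/NN barked/VBD ./."; str2tuple comes from nltk.
from nltk import str2tuple

for tokens, tags in tagged_corpus("corpus.tagged"):
    print(tokens, tags)  # e.g. ['The', 'dog', 'barked', '.'] ['DT', 'NN', 'VBD', '.']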
Example #7
def parser_mystr2tuple(s, minLength=1):
    output_list = []
    if len(s) > 4:
        s = s.replace("(u'", "('")
        s = s[2:len(s) - 2]
        s = s.replace("'", "")
        lst = s.split("), (")

        for i in range(0, len(lst)):
            word, tag = lst[i].split(", ")
            if len(word) >= minLength or tag == 'CD':
                output_list.append(nltk.str2tuple(word + "/" + tag))
    return output_list
Example #8
def split_syn(s1, s2):
    result = []
    raw = ''.join(s1)
    tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", ''.join(s2))
    tagged = ''.join(
        [morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])

    fraction = []
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)
        for mor_i in range(len(morph)):
            if mor_i == 0:
                fraction.append([morph[mor_i], "B-" + tag])
            else:
                fraction.append([morph[mor_i], "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  # append "+" after the tag
    fraction[-1][1] = fraction[-1][1][:-1]
    if raw == tagged:
        temp = []
        for i in range(len(fraction)):
            temp.append(raw[i])
            temp.extend(fraction[i])
            result.append(temp)
            temp = []
        return result
    blocks = make_del_block(fraction, raw, tagged)

    for block in blocks[:-1]:
        raw_b, raw_e, tag_b, tag_e = block[0], block[1], block[2], block[3]
        temp = []
        if raw_e - raw_b == tag_e - tag_b:
            for i in range(raw_e - raw_b):
                temp.append(raw[raw_b + i])
                temp.extend(fraction[tag_b + i])
                result.append(temp)
                temp = []
        elif raw_e - raw_b == 1:
            for i in range(tag_e - tag_b):
                temp.extend(fraction[tag_b + i])
            tag_syn = ''.join([temp[i] for i in range(0, len(temp), 2)])
            tag = ''.join([temp[i] for i in range(1, len(temp), 2)])
            result.append([raw[raw_b], tag_syn, tag])

    return result
Example #9
 def from_str(cls, string):
     (tokens, tags) = zip(*(str2tuple(tt) for tt in string.split()))
     return cls(tokens, tags)
How/WRB much/JJ did/VBZ ganguly/NNP hit/VBZ 6s/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ BB/NNP McCullum/NNP hit/VBZ 6s/CD in/IN match/NN 1CD/ ?/.
6s/CD hit/VBZ by/IN BB/NNP McCullum/NNP match/NN 1CD/ ?/.
4s/CD hit/VBZ by/IN BB/NNP McCullum/NNP match/NN 1CD/ ?/.
How/WRB much/JJ did/VBZ sc/NNP ganguly/NNP hit/VBZ 4s/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ ganguly/NNP hit/VBZ 4/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ BB/NNP McCullum/NNP hit/VBZ 4s/CD in/IN match/NN 1CD/ ?/.
how/WRB many/JJ balls/NN were/VBZ faced/VBZ by/IN BB/NNP McCullum/NNP in/IN match/NN 3/CD ?/.
how/WRB many/JJ deliveries/NN were/VBZ faced/VBZ by/IN BB/NNP Mccullum/NNP in/IN match/NN 3/CD ?/.                  
"""

# In[90]:

tagged_question = []
for word in word_tokenize(corrected_train):
    tagged_question.append(nltk.str2tuple(word))
#print(tagged_question)
train_data = []
train_data.append(tagged_question)

# In[91]:

from nltk.data import load
pos_tag = load('taggers/maxent_treebank_pos_tagger/english.pickle')

# In[92]:

from nltk.tag import SequentialBackoffTagger


class POSTagger(SequentialBackoffTagger):
Example #11
t1 = nltk.UnigramTagger(x_train, backoff=t0)
t2 = nltk.BigramTagger(x_train, backoff=t1)
print (t2.evaluate(x_test))  #0.863

# Train a Chinese POS tagger based on a Unigram model; the corpus is the tagged People's Daily data from January 1998, which can be downloaded online
import nltk
import json

lines = open('词性标注人民日报.txt',encoding='utf-8').readlines()
all_tagged_sents = []

for line in lines:
    sent = line.split()
    tagged_sent = []
    for item in sent:
        pair = nltk.str2tuple(item)
        tagged_sent.append(pair)

    if len(tagged_sent)>0:
        all_tagged_sents.append(tagged_sent)

train_size = int(len(all_tagged_sents)*0.8)
x_train = all_tagged_sents[:train_size]
x_test = all_tagged_sents[train_size:]

tagger = nltk.UnigramTagger(train=x_train,backoff=nltk.DefaultTagger('n'))

tokens = nltk.word_tokenize(u'我 认为 不丹 的 被动 卷入 不 构成 此次 对峙 的 主要 因素。')
tagged = tagger.tag(tokens)
#["我", "R"], ["认为", "V"], ["不丹", "n"], ["的", "U"], ["被动", "A"], ["卷入", "V"], ["不", "D"], ["构成", "V"], ["此次", "R"], ["对峙", "V"], ["的", "U"], ["主要", "B"], ["因素。", "n"]
print (tagger.evaluate(x_test)) #0.871
Example #12
 def from_str(cls, string):
     (tokens, tags) = zip(*(str2tuple(tt) for tt in string.split()))
     return cls(tokens, tags)
Example #13
def prep_scan(nb_words=None,
              skip_top=0,
              maxlen=None,
              test_split=0.2,
              seed=113,
              start_char=1,
              oov_char=2,
              index_from=3):

    from nltk import str2tuple

    with open("Data/CLFL_all_data.txt", "r") as f:
        raw_data = f.read()

    # separate sylls and labels and reject WBY
    data = [str2tuple(x) for x in raw_data.split()]
    data_lines = [[str2tuple(x) for x in line.split()]
                  for line in raw_data.split('\n')]
    data_lines = [[tup for tup in line if tup[0] != "WBY"]
                  for line in data_lines]

    # sylls to IDs
    sylls = [x[0] for x in data]
    sylls_lines = [[x[0] for x in line] for line in data_lines]
    sylls_set = list(set(sylls))
    sylls_ids = {}
    rev_sylls_ids = {}
    for i, x in enumerate(sylls_set):
        sylls_ids[x] = i + 1  # so we can pad with 0s
        rev_sylls_ids[i + 1] = x

    # labels to IDs
    tags = [x[1] for x in data]
    tags_lines = [[x[1] for x in line] for line in data_lines]
    tags_set = list(set(tags))
    print(len(tags_set))
    tags_ids = {}
    rev_tags_ids = {}
    for i, x in enumerate(tags_set):
        tags_ids[x] = i + 1  # so we can pad with 0s
        rev_tags_ids[i + 1] = x

    # lines of syll IDs
    all_sylls_ids = []
    for line in sylls_lines:
        s_l = [sylls_ids[x] for x in line]
        all_sylls_ids.append(s_l)

    # lines of label IDs
    all_tags_ids = []
    for line in tags_lines:
        t_l = [tags_ids[x] for x in line]
        all_tags_ids.append(t_l)

    X, labels = all_sylls_ids, all_tags_ids
    maxlen = len(max(labels, key=len))  # longest line in items

    # train and test split
    X_train = np.array(X[:int(len(X) * (1 - test_split))])
    y_train = np.array(labels[:int(len(X) * (1 - test_split))])

    X_test = np.array(X[int(len(X) * (1 - test_split)):])
    y_test = np.array(labels[int(len(X) * (1 - test_split)):])

    # pad with 0s
    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, value=0.)  # must be float
    X_test = sequence.pad_sequences(X_test, value=0.)

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    # need to pad y too, because more than 1 output value
    y_train = sequence.pad_sequences(np.array(y_train), value=0.)
    y_test = sequence.pad_sequences(np.array(y_test), value=0.)

    y_train = [np_utils.to_categorical(y) for y in y_train]
    y_test = [np_utils.to_categorical(y) for y in y_test]

    # create 3D array for keras multi-classfication
    new_y_train = []
    for array in y_train:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_train.append(
                np.hstack((array, np.zeros((array.shape[0], to_add)))))
        else:
            new_y_train.append(array)

    y_train = np.asarray(new_y_train)

    # create 3D array for keras multi-classfication
    new_y_test = []
    for array in y_test:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_test.append(
                np.hstack((array, np.zeros((array.shape[0], to_add)))))
        else:
            new_y_test.append(array)

    y_test = np.asarray(new_y_test)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    return ((X_train, y_train), (X_test, y_test), maxlen, rev_sylls_ids,
            rev_tags_ids)
Example #14
def calcTransition(text):
    f = open(text, "r")

    sentenceList = []

    for line in f:
        sentenceList.append(line)
    #print(sentenceList)

    #split into words/tags only
    sentenceListSplit = []
    for line in sentenceList:
        sentenceListSplit.append(line.split())
    #print(sentenceListSplit)

    #split into word/tags only for sentences
    sentenceListWordTags = []
    for sentence in sentenceListSplit:
        sentences = []
        for word in sentence:
            sentences.append(nl.str2tuple(word))
        sentenceListWordTags.append(sentences)

    tagsOnly = []
    for sentence in sentenceListWordTags:
        sentenceTags = []
        for tup in sentence:
            sentenceTags.append(tup[1])
        tagsOnly.append(sentenceTags)

    #make list with start tag for rows table
    tagsWithStart = []
    for sentence in tagsOnly:
        newSentence = ["START"]
        for tag in sentence:
            newSentence.append(tag)
        tagsWithStart.append(newSentence)

    #get the set of all possible tags from emission class
    allWordsPos = emission.breakWordsPosTuple(text)
    allTags = []
    for word in allWordsPos:
        allTags.append(word[1])
    allTags = set(allTags)
    allTagsStartSet = ["START"]
    for word in allTags:
        allTagsStartSet.append(word)

    #make frequencyTable. rows = allTagsStartSet, columns = allTags
    #add smooth .1
    frequencyTable = {}
    for tag in allTagsStartSet:
        frequencyTable[tag] = {}
        for t in allTags:
            frequencyTable[tag][t] = .1

    #fill table with frequency values.
    for sentence in tagsWithStart:
        for i in range(len(sentence) - 1):
            i = i + 1
            frequencyTable[sentence[i - 1]][sentence[i]] += 1

    r = open("README.txt", "a")

    #write values to readme for frequencyTable
    r.write("\n")
    r.write("FREQUENCY TABLE\n")
    r.write(" " * 10)
    for i in allTags:
        r.write("{0:<7}".format(i))
    r.write("\n")
    for i in frequencyTable:
        r.write("{0:>5}".format(i))
        for tag in frequencyTable[i]:
            r.write("{0:>7}".format(frequencyTable[i][tag]))
        r.write("\n")

    #make individual tag frequency count
    individualTagCount = {}
    for i in allTagsStartSet:
        individualTagCount[i] = 0

    for sentence in tagsWithStart:
        for tag in sentence:
            individualTagCount[tag] += 1

    #make transition table
    transitionTable = {}
    for tag in allTagsStartSet:
        transitionTable[tag] = {}
        for t in allTags:
            transitionTable[tag][t] = 0

    #fill transition table.
    #P(tag_i | tag_i-1) = (Count(tag_i-1, tag_i) + .1) / (Count(tag_i-1) + .1 * len(allTagsStartSet))
    #The .1 numerator smoothing was already added when building frequencyTable.
    for tag in allTagsStartSet:
        for t in allTags:
            transitionTable[tag][t] = round(
                frequencyTable[tag][t] /
                ((.1 * len(allTagsStartSet)) + individualTagCount[tag]), 2)

    r.write("\n")
    r.write("TRANSMISSION TABLE (with smoothing equation)\n")
    r.write(" " * 10)
    for i in allTags:
        r.write("{0:<7}".format(i))
    r.write("\n")
    for i in transitionTable:
        r.write("{0:>5}".format(i))
        for tag in transitionTable[i]:
            r.write("{0:>7}".format(transitionTable[i][tag]))
        r.write("\n")

    r.close()
    f.close()
    return transitionTable, allTags
Example #15
def generate_model():
    """Creates the statistical model for POS tagging"""
    brackets = ['[', ']']

    # conditional freq of each word given a tag
    tag_cfdist = ConditionalFreqDist()

    # conditional freq of each tag given another tag
    tag_bigrams = None

    # map of words to tags
    tag_dict = defaultdict(lambda: [])

    # final result, will be printed to file
    output = ''

    # model creation from training data
    with open(sys.argv[1], 'r') as file_:
        # word-tag pairs
        pairs = tuple(
            str2tuple(token.split('|')[0]) for token in file_.read().split()
            if token not in brackets)

        for word, tag in pairs:
            tag_dict[word].append(tag)
            tag_cfdist[tag][word] += 1

        tag_bigrams = ConditionalFreqDist(bigrams(pair[1] for pair in pairs))

    test_data = []
    with open(sys.argv[2], 'r') as file_:
        test_data = file_.read().splitlines()

    # guess plausible tag for first word
    prev_tag = 'VB'
    for line in test_data:
        for word in line.split():
            if word == brackets[0]:
                output += '[ '
            elif word == brackets[1]:
                output += ']'
            else:
                # assign 'NN' (noun) to unknown words
                if not tag_dict[word]:
                    if word[-1] == 's':
                        prev_tag = 'NNS'
                    elif word[-2:] == 'ed':
                        prev_tag = 'VBN'
                    elif word[0].isupper():
                        prev_tag = 'NNP'
                    elif word[-4:] == 'able' or '-' in word:
                        prev_tag = 'JJ'
                    else:
                        prev_tag = 'NN'

                    output += f'{word}/{prev_tag} '
                    continue

                argmax = 0
                best_tag = ''

                # find the most probable tag
                # formula = P(word|tag) * P(tag|prev_tag)
                for tag in tag_dict[word]:
                    prob = \
                        tag_cfdist[tag].freq(word) \
                        * tag_bigrams[prev_tag].freq(tag)
                    if prob >= argmax:
                        argmax = prob
                        best_tag = tag

                output += f'{word}/{best_tag} '
                prev_tag = best_tag

        # add newline char after each line
        output += '\n'

    print(output)
Example #16
def make_dict(raw_array, tagged_array, result_dic, bigram_dic):
    err_bi = {}
    with open("bigramerr.txt", 'r') as f:
        for i in f.readlines():
            err_bi[''.join(i.split()[:-1])] = 1
    print(err_bi)

    for raw_sent, tagged_sent in zip(raw_array, tagged_array):

        flag = 0
        collect_bigram = "START"
        if not len(raw_sent) == len(tagged_sent):
            continue

        for raw_word, tag_word in zip(raw_sent, tagged_sent):

            if "NA" in tag_word:
                flag = 1
                continue
            if collect_bigram != "START":
                collect_bigram += "+@@SP@@"
            tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tag_word)

            fraction = []
            tagged = ''.join(
                [morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])

            if raw_word == tagged:
                for morph in tag_morph:
                    pyocheung, postag = nltk.str2tuple(morph)
                    collect_bigram = collect_bigram + "+" + postag
                    # print_errer('·', '·', "SF", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SS", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '-', "SO", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '.', "SP", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SW", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    if "SH" in postag:
                        continue
                    if "SL" in postag:
                        continue
                    if "SN" in postag:
                        continue
                    count_dict(result_dic, str(pyocheung), [pyocheung, postag])
                continue

            for morph_tag in tag_morph:
                morph, tag = nltk.str2tuple(morph_tag)

                for syl in morph:
                    fraction.append([syl, tag])
                fraction[-1][1] = fraction[-1][1] + "+"  # append "+" after the tag
            fraction[-1][1] = fraction[-1][1][:-1]

            SM = SequenceMatcher(None, raw_word, tagged)
            if include_delete(SM):
                blocks = make_del_block(fraction, raw_word, tagged)
            else:
                mat_blocks = SM.get_matching_blocks()
                if len(mat_blocks) == 1:  # e.g. 온 vs 오/vx+ㄴ/etm: handle a completely mismatched form, just in case
                    postag = '+'.join([
                        morph_pos[morph_pos.rfind('/') + 1:]
                        for morph_pos in tag_morph
                    ])
                    collect_bigram = collect_bigram + "+" + postag
                    if "SH" in postag:
                        continue
                    if "SL" in postag:
                        continue
                    if "SN" in postag:
                        continue
                    # print_errer('·', '·', "SF", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SS", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '-', "SO", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '.', "SP", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SW", str(raw_word), tagged, postag, raw_word, tag_word)
                    count_dict(result_dic, str(raw_word), [tagged, postag])
                    continue
                blocks = generate_block(fraction, mat_blocks)
            raw_temp = ''
            tagged_temp = ''
            for i in blocks:
                raw_temp += raw_word[i[0]:i[1]]
                tagged_temp += tagged[i[2]:i[3]]
            if raw_word != raw_temp:
                print("로우이상")
                flag = 1

            if tagged != tagged_temp:  # cases like 이건, 이이건, etc. (VCP)

                flag = 1

            result = []

            for cur, nxt in pairwise(blocks):
                raw = raw_word[cur[0]:cur[1]]
                mor = tagged[cur[2]:cur[3]]
                postag = "/".join(fraction[tag_num][1]
                                  for tag_num in range(cur[2], cur[3]))
                post_loc = cur[2]
                if cur[1] != nxt[0] and cur[3] != nxt[2]:
                    raw = raw_word[cur[1]:nxt[0]]
                    mor = tagged[cur[3]:nxt[2]]
                    postag = "/".join(fraction[tag_num][1]
                                      for tag_num in range(cur[3], nxt[2]))
                    post_loc = cur[3]
                post_tag_list = del_dup(postag)
                if len(result) != 0 and mark_attach(result[-1][2][-1],
                                                    fraction[post_loc][1]):
                    result[-1][2][-1] = result[-1][2][-1] + __pre_mark
                    post_tag_list[0] = __post_mark + post_tag_list[0]

                result.append([raw, mor, post_tag_list])
            for data in result:
                tags = data[2]
                tag_list = [remove_plus(tag) for tag in tags]
                # if "SH" in tag_list:
                #     continue
                # if "SL" in tag_list:
                #     continue
                # if "SN" in tag_list:
                #     continue
                postag_result = "+".join(tag_list)
                collect_bigram = collect_bigram + "+" + postag_result
                # print_errer('·','·',"SF",str(data[0]),data[1],postag_result,raw_word,tag_word)
                # print_errer('·', '·', "SS", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '-', "SO", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '.', "SP", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '·', "SW", str(data[0]), data[1], postag_result, raw_word, tag_word)
                count_dict(result_dic, str(data[0]), [data[1], postag_result])
            # for cur, nxt in pairwise(collect_bigram.split('+')):
            #     if cur + nxt == ">NNG@@SP@@":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "SONR":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "@@SP@@XSA":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "NNGVCN":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "XSVETN<":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "MMJKB":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "JCSF":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "ETMEC":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "JXVCN":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "SWMAJ":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)

        if flag == 0:
            collect_bigram += "+END"
            for cur_t, nxt_t in pairwise(collect_bigram.split('+')):
                if err_bi.get(cur_t + nxt_t) != None:
                    print(cur_t + "+" + nxt_t)
                    print(raw_sent, tagged_sent)

            make_bigram(bigram_dic, collect_bigram)
    # print(result_dic.get('.'))

    return result_dic, bigram_dic
Example #17
conll2000.chunked_sents()
# NLTK's chunking regular expressions have their own semantics: chunking works on POS tags,
# so POS tagging must already have been done (POS tagging was covered in the earlier chapters).
# NN   common noun, singular
# NNS  common noun, plural
# NNP  proper noun, singular
# NNPS proper noun, plural

grammer = r"NP:{<JJ|CD|DT><JJ>?<NNS>}"  # starts with JJ, CD, or DT; note the <JJ|CD|DT><JJ> notation: the angle brackets belong to NLTK, not to the regular expression
import nltk

rp = nltk.RegexpParser(grammer)
sentens = ["many/JJ", "researchers/NNS", "two/CD", "weeks/NNS", "both/DT", "new/JJ", "positions/NNS"]
sents = [nltk.str2tuple(str) for str in sentens]
rp.parse(sents)
"""
3. 选择CoNLL-2000分块语料库中三种块类型之一。查看这些数据, 并尝试观察组成这
种类型的块的 POS标记序列的任一模式。 使用正则表达式分块器nltk.RegexpParser
开发一个简单的分块器。讨论任何难以可靠分块的标记序列。
"""
# 查看三种语料库之一,我们选择NP类型
from nltk.corpus import conll2000

tree = conll2000.chunked_sents(chunk_types=['NP'])
tree[0].draw()  # draw the first chunked sentence
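
# A minimal sketch for exercise 3 above (the grammar is an assumption, not a reference
# solution): chunk optional determiner + adjectives + nouns as NP and score the chunker
# against the CoNLL-2000 test set.
import nltk
from nltk.corpus import conll2000

np_grammar = r"NP: {<DT>?<JJ.*>*<NN.*>+}"
np_chunker = nltk.RegexpParser(np_grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(np_chunker.evaluate(test_sents))  # ChunkParse score: IOB accuracy, precision, recall, F-measure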
"""
4. 块的早期定义是出现在缝隙之间的材料。开发一个分块器以将完整的句子作为一个单
独的块开始, 然后其余的工作完全由加缝隙完成。 在你自己的应用程序的帮助下, 确定
哪些标记( 或标记序列)最有可能组成缝隙。 相对于完全基于块规则的分块器比较这种
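
# A hedged sketch for exercise 4 (the chink tag set is an assumption): start with the whole
# sentence as a single NP chunk, then chink out verbs, prepositions, conjunctions, commas
# and colons, which are likely gap material.
import nltk
from nltk.corpus import conll2000

chink_grammar = r"""
  NP:
    {<.*>+}                    # the whole sentence starts as one chunk
    }<VB.*|IN|TO|CC|,|:>+{     # chink out likely gap material
"""
chink_chunker = nltk.RegexpParser(chink_grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(chink_chunker.evaluate(test_sents))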
Example #18
def prep_scan(nb_words=None, skip_top=0,
              maxlen=None, test_split=0.2, seed=113,
              start_char=1, oov_char=2, index_from=3):

    from nltk import str2tuple

    with open("Data/CLFL_all_data.txt", "r") as f:
        raw_data = f.read()

    # separate sylls and labels and reject WBY
    data = [str2tuple(x) for x in raw_data.split()]
    data_lines = [[str2tuple(x) for x in line.split()]
                  for line in raw_data.split('\n')]
    data_lines = [[tup for tup in line if tup[0] != "WBY"] for line in
                  data_lines]

    # sylls to IDs
    sylls = [x[0] for x in data]
    sylls_lines = [[x[0] for x in line] for line in data_lines]
    sylls_set = list(set(sylls))
    sylls_ids = {}
    rev_sylls_ids = {}
    for i, x in enumerate(sylls_set):
        sylls_ids[x] = i + 1  # so we can pad with 0s
        rev_sylls_ids[i + 1] = x

    # labels to IDs
    tags = [x[1] for x in data]
    tags_lines = [[x[1] for x in line] for line in data_lines]
    tags_set = list(set(tags))
    print(len(tags_set))
    tags_ids = {}
    rev_tags_ids = {}
    for i, x in enumerate(tags_set):
        tags_ids[x] = i + 1  # so we can pad with 0s
        rev_tags_ids[i + 1] = x

    # lines of syll IDs
    all_sylls_ids = []
    for line in sylls_lines:
        s_l = [sylls_ids[x] for x in line]
        all_sylls_ids.append(s_l)

    # lines of label IDs
    all_tags_ids = []
    for line in tags_lines:
        t_l = [tags_ids[x] for x in line]
        all_tags_ids.append(t_l)

    X, labels = all_sylls_ids, all_tags_ids
    maxlen = len(max(labels, key=len))  # longest line in items

    # train and test split
    X_train = np.array(X[:int(len(X) * (1 - test_split))])
    y_train = np.array(labels[:int(len(X) * (1 - test_split))])

    X_test = np.array(X[int(len(X) * (1 - test_split)):])
    y_test = np.array(labels[int(len(X) * (1 - test_split)):])

    # pad with 0s
    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, value=0.)  # must be float
    X_test = sequence.pad_sequences(X_test, value=0.)

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    # need to pad y too, because more than 1 output value
    y_train = sequence.pad_sequences(np.array(y_train), value=0.)
    y_test = sequence.pad_sequences(np.array(y_test), value=0.)

    y_train = [np_utils.to_categorical(y) for y in y_train]
    y_test = [np_utils.to_categorical(y) for y in y_test]

    # create 3D array for keras multi-classfication
    new_y_train = []
    for array in y_train:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_train.append(np.hstack((array, np.zeros((array.shape[0],
                                                           to_add)))))
        else:
            new_y_train.append(array)

    y_train = np.asarray(new_y_train)

    # create 3D array for keras multi-classfication
    new_y_test = []
    for array in y_test:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_test.append(np.hstack((array, np.zeros((array.shape[0],
                                                          to_add)))))
        else:
            new_y_test.append(array)

    y_test = np.asarray(new_y_test)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    return ((X_train, y_train), (X_test, y_test), maxlen, rev_sylls_ids,
            rev_tags_ids)
Example #19
def tagged_text(text):
    return [nltk.str2tuple(w) for w in text.split()]
from pandas import DataFrame
import pandas
import nltk
import numpy as np
import viterbi  # local module providing the Decoder used below (assumed available)

infile = "JurafskyMartinHmmDecode.xlsx"

Apandas = pandas.read_excel(infile, sheetname="Transitions")
#print Apandas
rownames = Apandas.index.tolist()
A = np.array(Apandas)

Bpandas = pandas.read_excel(infile, 'ObsLikelihood')
#print Bpandas
B = np.array(Bpandas)
statenames = Bpandas.index.tolist()

trans = A[1:, :]
pi = np.expand_dims(np.array(A[0, :]), 1)
decoder = viterbi.Decoder(pi, trans, B)
""" do the decoding """
states = decoder.Decode(np.arange(5))
result = np.array(statenames)[states].tolist()
sentence = Bpandas.columns.tolist()
resultTagged = list(zip(sentence, result))

correct = ' Janet/NNP will/MD back/VB the/DT bill/NN'
correct = [nltk.str2tuple(x) for x in correct.split()]
assert (resultTagged == correct)

print "PASSED"
Example #21
infile="JurafskyMartinHmmDecode.xlsx"


Apandas = pandas.read_excel(infile,sheetname="Transitions") 
#print Apandas
rownames = Apandas.index.tolist()
A=np.array(Apandas)

Bpandas = pandas.read_excel(infile,'ObsLikelihood')
#print Bpandas
B=np.array(Bpandas)
statenames = Bpandas.index.tolist()


trans=A[1:,:]
pi=np.expand_dims(np.array(A[0,:]),1)
decoder = viterbi.Decoder(pi,trans, B)

""" do the decoding """
states =  decoder.Decode(np.arange(5))
result = np.array(statenames)[states].tolist()
sentence = Bpandas.columns.tolist()
resultTagged = list(zip(sentence, result))

correct=' Janet/NNP will/MD back/VB the/DT bill/NN'
correct=[nltk.str2tuple(x) for x in correct.split()]
assert (resultTagged==correct)

print "PASSED"
import sys
import nltk
from libraries import files

if len(sys.argv) >= 2:
    FILE = sys.argv[1]
    reviews = files.read_from_xml(FILE)
    count = 0
    for review in reviews:
        text = review.content.split()
        for token in text:
            str_token = nltk.str2tuple(token, '/')
            if str_token[1] is None or str_token[1] == '':
                print(str_token)
                count += 1

    print(count)
else:
    print('Invalid command. Please use the format:\n python check-missing-tags.py <filename>')
    sys.exit()