from string import punctuation

from nltk import str2tuple
from nltk.tokenize import WhitespaceTokenizer


def clean_googlengram(line):
    """Removes speech tags from a line, specific to the googlengram module

    Param: line (unicode)
    Returns: (changed (bool), line (unicode))
    """
    return_line = line.split("\t")[0]  # get the ngram; drop year, counter, etc.
    clean = []
    words = WhitespaceTokenizer().tokenize(return_line)
    for word in words:
        # In >1-grams, transitions to specific tags are written as:
        # The_ADJ _NOUN_ (meaning from "The" there is a transition to a noun).
        # We remove those.
        if word[0] != '_' and word[-1] != '_':
            # Split the token and the tag on the '_'
            token, tag = str2tuple(word, '_')
            # Punctuation will be added back using rules.
            if len(token) > 1:
                if tag not in ('PUNCT', '.', ''):
                    clean.append(token)
            elif token not in punctuation:
                clean.append(token)
    return_line = ' '.join(clean)
    if return_line != line:
        return True, return_line
    else:
        return False, line
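# A minimal usage sketch (not part of the original module): the sample line is
# made-up Google Ngram data in the "ngram<TAB>year<TAB>match_count<TAB>volume_count" layout.
changed, cleaned = clean_googlengram("The_DET quick_ADJ fox_NOUN\t1999\t12\t9")
print(changed, cleaned)  # True "The quick fox" -- the POS suffixes are stripped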
def align(sent):
    tagged = re.split(r'\s+', sent)
    raw_word = tagged[0]
    tagged[1] = re.compile(r'__[0-9]+').sub('', tagged[1])
    tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tagged[1])
    tagged = ''.join([morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
    fraction = list()
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)
        for i, syl in enumerate(morph):
            if i == 0:
                fraction.append([syl, "B-" + tag])
            else:
                fraction.append([syl, "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  # append '+' after the tag
    fraction[-1][1] = fraction[-1][1][:-1]
    print(raw_word, tagged)
    if raw_word == tagged:
        return fraction
    SM = SequenceMatcher(None, raw_word, tagged)
    blocks = list()
    if include_delete(SM):
        blocks = make_del_block(fraction, raw_word, tagged)
    else:
        mat_blocks = SM.get_matching_blocks()
        blocks = generate_block(fraction, mat_blocks)
        if len(mat_blocks) == 1:  # e.g. 온 -> 오/vx+ㄴ/etm: the (unlikely) fully mismatched case
            blocks = make_del_block(fraction, raw_word, tagged)
    print(blocks)
    for cur, nxt in pairwise(blocks):
        raw = raw_word[cur[0]:cur[1]]
        mor = tagged[cur[2]:cur[3]]
        print(raw, mor)
def lex_smooth(self, data):
    """Smoothing of the whole language model"""
    p_smoothed = defaultdict(float)
    for token in data:
        word, tag = str2tuple(token)
        p_smoothed[(word, tag)] = self.get_probs(word, tag)
    return p_smoothed
def unpack_keyword(item):
    val, vty, pty = item
    if analyse_pos:
        word, pos = nltk.str2tuple(val)
    else:
        word = val
        pos = "N/A"
    return (word, pos, vty * pty)
def tagged_corpus(filename):
    """Lazily read tagged corpus from a file `filename`"""
    try:
        with open(filename, "r") as source:
            for line in source:
                (tokens, tags) = zip(*(str2tuple(wt) for wt in line.split()))
                yield (list(tokens), list(tags))
    except IOError as error:
        exit(error)
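# Hypothetical usage of tagged_corpus: "corpus.txt" is a placeholder for any file
# holding one sentence per line of word/TAG tokens.
for tokens, tags in tagged_corpus("corpus.txt"):
    print(tokens[:5], tags[:5])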
def parser_mystr2tuple(s, minLength=1):
    output_list = []
    if len(s) > 4:
        s = s.replace("(u'", "('")
        s = s[2:len(s) - 2]
        s = s.replace("'", "")
        lst = s.split("), (")
        for i in range(0, len(lst)):
            word, tag = lst[i].split(", ")
            if len(word) >= minLength or tag == 'CD':
                output_list.append(nltk.str2tuple(word + "/" + tag))
    return output_list
def split_syn(s1, s2):
    result = []
    raw = ''.join(s1)
    tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", ''.join(s2))
    tagged = ''.join(
        [morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
    fraction = []
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)
        for mor_i in range(len(morph)):
            if mor_i == 0:
                fraction.append([morph[mor_i], "B-" + tag])
            else:
                fraction.append([morph[mor_i], "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  # append '+' after the tag
    fraction[-1][1] = fraction[-1][1][:-1]
    if raw == tagged:
        temp = []
        for i in range(len(fraction)):
            temp.append(raw[i])
            temp.extend(fraction[i])
            result.append(temp)
            temp = []
        return result
    blocks = make_del_block(fraction, raw, tagged)
    for block in blocks[:-1]:
        raw_b, raw_e, tag_b, tag_e = block[0], block[1], block[2], block[3]
        temp = []
        if raw_e - raw_b == tag_e - tag_b:
            for i in range(raw_e - raw_b):
                temp.append(raw[raw_b + i])
                temp.extend(fraction[tag_b + i])
                result.append(temp)
                temp = []
        elif raw_e - raw_b == 1:
            for i in range(tag_e - tag_b):
                temp.extend(fraction[tag_b + i])
            tag_syn = ''.join([temp[i] for i in range(0, len(temp), 2)])
            tag = ''.join([temp[i] for i in range(1, len(temp), 2)])
            result.append([raw[raw_b], tag_syn, tag])
    return result
def from_str(cls, string):
    (tokens, tags) = zip(*(str2tuple(tt) for tt in string.split()))
    return cls(tokens, tags)
How/WRB much/JJ did/VBZ ganguly/NNP hit/VBZ 6s/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ BB/NNP McCullum/NNP hit/VBZ 6s/CD in/IN match/NN 1CD/ ?/.
6s/CD hit/VBZ by/IN BB/NNP McCullum/NNP match/NN 1CD/ ?/.
4s/CD hit/VBZ by/IN BB/NNP McCullum/NNP match/NN 1CD/ ?/.
How/WRB much/JJ did/VBZ sc/NNP ganguly/NNP hit/VBZ 4s/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ ganguly/NNP hit/VBZ 4/CD in/IN match/NN 1/CD ?/.
How/WRB much/JJ did/VBZ BB/NNP McCullum/NNP hit/VBZ 4s/CD in/IN match/NN 1CD/ ?/.
how/WRB many/JJ balls/NN were/VBZ faced/VBZ by/IN BB/NNP McCullum/NNP in/IN match/NN 3/CD ?/.
how/WRB many/JJ deliveries/NN were/VBZ faced/VBZ by/IN BB/NNP Mccullum/NNP in/IN match/NN 3/CD ?/.
"""

# In[90]:

tagged_question = []
for word in word_tokenize(corrected_train):
    tagged_question.append(nltk.str2tuple(word))
# print(tagged_question)

train_data = []
train_data.append(tagged_question)

# In[91]:

from nltk.data import load
pos_tag = load('taggers/maxent_treebank_pos_tagger/english.pickle')

# In[92]:

from nltk.tag import SequentialBackoffTagger

class POSTagger(SequentialBackoffTagger):
t1 = nltk.UnigramTagger(x_train, backoff=t0)
t2 = nltk.BigramTagger(x_train, backoff=t1)
print(t2.evaluate(x_test))  # 0.863

# Train a Chinese POS tagger with a Unigram model; the corpus is the freely
# downloadable tagged People's Daily data from January 1998.
import nltk
import json

lines = open('词性标注人民日报.txt', encoding='utf-8').readlines()
all_tagged_sents = []

for line in lines:
    sent = line.split()
    tagged_sent = []
    for item in sent:
        pair = nltk.str2tuple(item)
        tagged_sent.append(pair)
    if len(tagged_sent) > 0:
        all_tagged_sents.append(tagged_sent)

train_size = int(len(all_tagged_sents) * 0.8)
x_train = all_tagged_sents[:train_size]
x_test = all_tagged_sents[train_size:]

tagger = nltk.UnigramTagger(train=x_train, backoff=nltk.DefaultTagger('n'))
tokens = nltk.word_tokenize(u'我 认为 不丹 的 被动 卷入 不 构成 此次 对峙 的 主要 因素。')
tagged = tagger.tag(tokens)
# [["我", "R"], ["认为", "V"], ["不丹", "n"], ["的", "U"], ["被动", "A"], ["卷入", "V"],
#  ["不", "D"], ["构成", "V"], ["此次", "R"], ["对峙", "V"], ["的", "U"], ["主要", "B"], ["因素。", "n"]]
print(tagger.evaluate(x_test))  # 0.871
def prep_scan(nb_words=None, skip_top=0, maxlen=None, test_split=0.2,
              seed=113, start_char=1, oov_char=2, index_from=3):
    from nltk import str2tuple

    with open("Data/CLFL_all_data.txt", "r") as f:
        raw_data = f.read()

    # separate sylls and labels and reject WBY
    data = [str2tuple(x) for x in raw_data.split()]
    data_lines = [[str2tuple(x) for x in line.split()]
                  for line in raw_data.split('\n')]
    data_lines = [[tup for tup in line if tup[0] != "WBY"]
                  for line in data_lines]

    # sylls to IDs
    sylls = [x[0] for x in data]
    sylls_lines = [[x[0] for x in line] for line in data_lines]
    sylls_set = list(set(sylls))
    sylls_ids = {}
    rev_sylls_ids = {}
    for i, x in enumerate(sylls_set):
        sylls_ids[x] = i + 1  # so we can pad with 0s
        rev_sylls_ids[i + 1] = x

    # labels to IDs
    tags = [x[1] for x in data]
    tags_lines = [[x[1] for x in line] for line in data_lines]
    tags_set = list(set(tags))
    print(len(tags_set))
    tags_ids = {}
    rev_tags_ids = {}
    for i, x in enumerate(tags_set):
        tags_ids[x] = i + 1  # so we can pad with 0s
        rev_tags_ids[i + 1] = x

    # lines of syll IDs
    all_sylls_ids = []
    for line in sylls_lines:
        s_l = [sylls_ids[x] for x in line]
        all_sylls_ids.append(s_l)

    # lines of label IDs
    all_tags_ids = []
    for line in tags_lines:
        t_l = [tags_ids[x] for x in line]
        all_tags_ids.append(t_l)

    X, labels = all_sylls_ids, all_tags_ids
    maxlen = len(max(labels, key=len))  # longest line in items

    # train and test split
    X_train = np.array(X[:int(len(X) * (1 - test_split))])
    y_train = np.array(labels[:int(len(X) * (1 - test_split))])
    X_test = np.array(X[int(len(X) * (1 - test_split)):])
    y_test = np.array(labels[int(len(X) * (1 - test_split)):])

    # pad with 0s
    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, value=0.)  # must be float
    X_test = sequence.pad_sequences(X_test, value=0.)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    # need to pad y too, because more than 1 output value
    y_train = sequence.pad_sequences(np.array(y_train), value=0.)
    y_test = sequence.pad_sequences(np.array(y_test), value=0.)
    y_train = [np_utils.to_categorical(y) for y in y_train]
    y_test = [np_utils.to_categorical(y) for y in y_test]

    # create 3D array for keras multi-classification
    new_y_train = []
    for array in y_train:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_train.append(
                np.hstack((array, np.zeros((array.shape[0], to_add)))))
        else:
            new_y_train.append(array)
    y_train = np.asarray(new_y_train)

    # create 3D array for keras multi-classification
    new_y_test = []
    for array in y_test:
        if len(array[0]) < 10:
            to_add = 10 - len(array[0])
            new_y_test.append(
                np.hstack((array, np.zeros((array.shape[0], to_add)))))
        else:
            new_y_test.append(array)
    y_test = np.asarray(new_y_test)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    return ((X_train, y_train), (X_test, y_test),
            maxlen, rev_sylls_ids, rev_tags_ids)
def calcTransition(text):
    f = open(text, "r")
    sentenceList = []
    for line in f:
        sentenceList.append(line)
    # print(sentenceList)

    # split each sentence into word/tag tokens
    sentenceListSplit = []
    for line in sentenceList:
        sentenceListSplit.append(line.split())
    # print(sentenceListSplit)

    # split tokens into (word, tag) tuples per sentence
    sentenceListWordTags = []
    for sentence in sentenceListSplit:
        sentences = []
        for word in sentence:
            sentences.append(nl.str2tuple(word))
        sentenceListWordTags.append(sentences)

    tagsOnly = []
    for sentence in sentenceListWordTags:
        sentenceTags = []
        for tup in sentence:
            sentenceTags.append(tup[1])
        tagsOnly.append(sentenceTags)

    # make list with start tag for rows table
    tagsWithStart = []
    for sentence in tagsOnly:
        newSentence = ["START"]
        for tag in sentence:
            newSentence.append(tag)
        tagsWithStart.append(newSentence)

    # get the set of all possible tags from the emission module
    allWordsPos = emission.breakWordsPosTuple(text)
    allTags = []
    for word in allWordsPos:
        allTags.append(word[1])
    allTags = set(allTags)
    allTagsStartSet = ["START"]
    for word in allTags:
        allTagsStartSet.append(word)

    # make frequencyTable: rows = allTagsStartSet, columns = allTags
    # add smoothing of .1
    frequencyTable = {}
    for tag in allTagsStartSet:
        frequencyTable[tag] = {}
        for t in allTags:
            frequencyTable[tag][t] = .1

    # fill table with frequency values
    for sentence in tagsWithStart:
        for i in range(1, len(sentence)):
            frequencyTable[sentence[i - 1]][sentence[i]] += 1

    r = open("README.txt", "a")

    # write frequencyTable values to the readme
    r.write("\n")
    r.write("FREQUENCY TABLE\n")
    r.write(" " * 10)
    for i in allTags:
        r.write("{0:<7}".format(i))
    r.write("\n")
    for i in frequencyTable:
        r.write("{0:>5}".format(i))
        for tag in frequencyTable[i]:
            r.write("{0:>7}".format(frequencyTable[i][tag]))
        r.write("\n")

    # make individual tag frequency count
    individualTagCount = {}
    for i in allTagsStartSet:
        individualTagCount[i] = 0
    for sentence in tagsWithStart:
        for tag in sentence:
            individualTagCount[tag] += 1

    # make transition table
    transitionTable = {}
    for tag in allTagsStartSet:
        transitionTable[tag] = {}
        for t in allTags:
            transitionTable[tag][t] = 0

    # fill transition table:
    # P(tag_i | tag_i-1) = (Count(tag_i-1, tag_i) + .1) /
    #                      (Count(tag_i-1) + .1 * len(allTagsStartSet))
    # the numerator smoothing was already added above
    for tag in allTagsStartSet:
        for t in allTags:
            transitionTable[tag][t] = round(
                frequencyTable[tag][t] /
                ((.1 * len(allTagsStartSet)) + individualTagCount[tag]), 2)

    r.write("\n")
    r.write("TRANSITION TABLE (with smoothing equation)\n")
    r.write(" " * 10)
    for i in allTags:
        r.write("{0:<7}".format(i))
    r.write("\n")
    for i in transitionTable:
        r.write("{0:>5}".format(i))
        for tag in transitionTable[i]:
            r.write("{0:>7}".format(transitionTable[i][tag]))
        r.write("\n")
    r.close()
    f.close()
    return transitionTable, allTags
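# Hypothetical call (the file name is a placeholder); calcTransition expects a file of
# word/TAG sentences plus the emission module and the `nl` (nltk) alias used above.
transition_table, all_tags = calcTransition("train.txt")
print(transition_table["START"])  # smoothed P(tag | START) for every tag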
def generate_model():
    """Creates the statistical model for POS tagging"""
    brackets = ['[', ']']
    # conditional freq of each word given a tag
    tag_cfdist = ConditionalFreqDist()
    # conditional freq of each tag given another tag
    tag_bigrams = None
    # map of words to tags
    tag_dict = defaultdict(list)
    # final result, will be printed to file
    output = ''

    # model creation from training data
    with open(sys.argv[1], 'r') as file_:
        # word-tag pairs
        pairs = tuple(
            str2tuple(token.split('|')[0])
            for token in file_.read().split()
            if token not in brackets)
        for word, tag in pairs:
            tag_dict[word].append(tag)
            tag_cfdist[tag][word] += 1
        tag_bigrams = ConditionalFreqDist(bigrams(pair[1] for pair in pairs))

    test_data = []
    with open(sys.argv[2], 'r') as file_:
        test_data = file_.read().splitlines()

    # guess a plausible tag for the first word
    prev_tag = 'VB'
    for line in test_data:
        for word in line.split():
            if word == brackets[0]:
                output += '[ '
            elif word == brackets[1]:
                output += ']'
            else:
                # assign 'NN' (noun) to unknown words
                if not tag_dict[word]:
                    if word[-1] == 's':
                        prev_tag = 'NNS'
                    elif word[-2:] == 'ed':
                        prev_tag = 'VBN'
                    elif word[0].isupper():
                        prev_tag = 'NNP'
                    elif word[-4:] == 'able' or '-' in word:
                        prev_tag = 'JJ'
                    else:
                        prev_tag = 'NN'
                    output += f'{word}/{prev_tag} '
                    continue
                argmax = 0
                best_tag = ''
                # find the most probable tag
                # formula = P(word|tag) * P(tag|prev_tag)
                for tag in tag_dict[word]:
                    prob = \
                        tag_cfdist[tag].freq(word) \
                        * tag_bigrams[prev_tag].freq(tag)
                    if prob >= argmax:
                        argmax = prob
                        best_tag = tag
                output += f'{word}/{best_tag} '
                prev_tag = best_tag
        # add newline char after each line
        output += '\n'
    print(output)
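# Hypothetical invocation (script and file names are placeholders): argv[1] is the
# tagged training file, argv[2] the untagged test file; the tagged output is printed.
#   python generate_model.py training.pos test.txt > tagged_output.txt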
def make_dict(raw_array, tagged_array, result_dic, bigram_dic):
    err_bi = {}
    with open("bigramerr.txt", 'r') as f:
        for i in f.readlines():
            err_bi[''.join(i.split()[:-1])] = 1
    print(err_bi)
    for raw_sent, tagged_sent in zip(raw_array, tagged_array):
        flag = 0
        collect_bigram = "START"
        if not len(raw_sent) == len(tagged_sent):
            continue
        for raw_word, tag_word in zip(raw_sent, tagged_sent):
            if "NA" in tag_word:
                flag = 1
                continue
            if collect_bigram != "START":
                collect_bigram += "+@@SP@@"
            tag_morph = re.split(r"(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tag_word)
            fraction = []
            tagged = ''.join(
                [morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
            if raw_word == tagged:
                for morph in tag_morph:
                    pyocheung, postag = nltk.str2tuple(morph)
                    collect_bigram = collect_bigram + "+" + postag
                    # print_errer('·', '·', "SF", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SS", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '-', "SO", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '.', "SP", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SW", str(pyocheung), pyocheung, postag, raw_word, tag_word)
                    if "SH" in postag:
                        continue
                    if "SL" in postag:
                        continue
                    if "SN" in postag:
                        continue
                    count_dict(result_dic, str(pyocheung), [pyocheung, postag])
                continue
            for morph_tag in tag_morph:
                morph, tag = nltk.str2tuple(morph_tag)
                for syl in morph:
                    fraction.append([syl, tag])
                fraction[-1][1] = fraction[-1][1] + "+"  # append '+' after the tag
            fraction[-1][1] = fraction[-1][1][:-1]
            SM = SequenceMatcher(None, raw_word, tagged)
            if include_delete(SM):
                blocks = make_del_block(fraction, raw_word, tagged)
            else:
                mat_blocks = SM.get_matching_blocks()
                if len(mat_blocks) == 1:
                    # e.g. 온 -> 오/vx+ㄴ/etm: the (unlikely) fully mismatched case
                    postag = '+'.join([
                        morph_pos[morph_pos.rfind('/') + 1:]
                        for morph_pos in tag_morph
                    ])
                    collect_bigram = collect_bigram + "+" + postag
                    if "SH" in postag:
                        continue
                    if "SL" in postag:
                        continue
                    if "SN" in postag:
                        continue
                    # print_errer('·', '·', "SF", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SS", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '-', "SO", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '.', "SP", str(raw_word), tagged, postag, raw_word, tag_word)
                    # print_errer('·', '·', "SW", str(raw_word), tagged, postag, raw_word, tag_word)
                    count_dict(result_dic, str(raw_word), [tagged, postag])
                    continue
                blocks = generate_block(fraction, mat_blocks)
            raw_temp = ''
            tagged_temp = ''
            for i in blocks:
                raw_temp += raw_word[i[0]:i[1]]
                tagged_temp += tagged[i[2]:i[3]]
            if raw_word != raw_temp:
                print("raw word mismatch")
                flag = 1
            if tagged != tagged_temp:  # cases like 이건/이이건 etc. (VCP)
                flag = 1
            result = []
            for cur, nxt in pairwise(blocks):
                raw = raw_word[cur[0]:cur[1]]
                mor = tagged[cur[2]:cur[3]]
                postag = "/".join(fraction[tag_num][1]
                                  for tag_num in range(cur[2], cur[3]))
                post_loc = cur[2]
                if cur[1] != nxt[0] and cur[3] != nxt[2]:
                    raw = raw_word[cur[1]:nxt[0]]
                    mor = tagged[cur[3]:nxt[2]]
                    postag = "/".join(fraction[tag_num][1]
                                      for tag_num in range(cur[3], nxt[2]))
                    post_loc = cur[3]
                post_tag_list = del_dup(postag)
                if len(result) != 0 and mark_attach(result[-1][2][-1], fraction[post_loc][1]):
                    result[-1][2][-1] = result[-1][2][-1] + __pre_mark
                    post_tag_list[0] = __post_mark + post_tag_list[0]
                result.append([raw, mor, post_tag_list])
            for data in result:
                tags = data[2]
                tag_list = [remove_plus(tag) for tag in tags]
                # if "SH" in tag_list:
                #     continue
                # if "SL" in tag_list:
                #     continue
                # if "SN" in tag_list:
                #     continue
                postag_result = "+".join(tag_list)
                collect_bigram = collect_bigram + "+" + postag_result
                # print_errer('·','·',"SF",str(data[0]),data[1],postag_result,raw_word,tag_word)
                # print_errer('·', '·', "SS", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '-', "SO", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '.', "SP", str(data[0]), data[1], postag_result, raw_word, tag_word)
                # print_errer('·', '·', "SW", str(data[0]), data[1], postag_result, raw_word, tag_word)
                count_dict(result_dic, str(data[0]), [data[1], postag_result])
            # for cur, nxt in pairwise(collect_bigram.split('+')):
            #     if cur + nxt == ">NNG@@SP@@":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "SONR":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "@@SP@@XSA":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "NNGVCN":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "XSVETN<":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "MMJKB":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "JCSF":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "ETMEC":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "JXVCN":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
            #     if cur + nxt == "SWMAJ":
            #         print(cur + nxt)
            #         print(raw_word, tag_word)
        if flag == 0:
            collect_bigram += "+END"
            for cur_t, nxt_t in pairwise(collect_bigram.split('+')):
                if err_bi.get(cur_t + nxt_t) != None:
                    print(cur_t + "+" + nxt_t)
                    print(raw_sent, tagged_sent)
            make_bigram(bigram_dic, collect_bigram)
    # print(result_dic.get('.'))
    return result_dic, bigram_dic
conll2000.chunked_sents()

# NLTK's chunk grammars look like regular expressions but have their own syntax:
# they chunk over POS tags, so POS tagging (covered in the previous chapter) must
# already have been done.
# NN    common noun, singular
# NNS   common noun, plural
# NNP   proper noun, singular
# NNPS  proper noun, plural
grammar = r"NP:{<JJ|CD|DT><JJ>?<NNS>}"  # starts with JJ, CD or DT; note that the angle
# brackets in <JJ|CD|DT><JJ>? are NLTK chunk-grammar syntax, not plain regex
import nltk
rp = nltk.RegexpParser(grammar)
sentens = ["many/JJ", "researchers/NNS", "two/CD", "weeks/NNS",
           "both/DT", "new/JJ", "positions/NNS"]
sents = [nltk.str2tuple(s) for s in sentens]
rp.parse(sents)

"""
3. Pick one of the three chunk types in the CoNLL-2000 chunking corpus. Look at the
data and try to observe any patterns in the POS tag sequences that make up this kind
of chunk. Develop a simple chunker using the regular-expression chunker
nltk.RegexpParser. Discuss any tag sequences that are difficult to chunk reliably.
"""
# Inspect one of the three chunk types; we choose NP.
from nltk.corpus import conll2000
trees = conll2000.chunked_sents(chunk_types=['NP'])
trees[0].draw()  # chunked_sents() returns a sequence of trees; draw the first one

"""
4. An early definition of a chunk was the material that occurs between chinks.
Develop a chunker that starts by putting the whole sentence in a single chunk and
then does the rest of its work solely by chinking. With the help of your own
application, determine which tags (or tag sequences) are most likely to make up
chinks. Compare this approach with a chunker based entirely on chunk rules
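# A possible starting point for exercise 3 above (illustrative only, not a reference
# solution): a small hand-written NP grammar evaluated against the CoNLL-2000 test split.
import nltk
from nltk.corpus import conll2000

np_grammar = r"NP: {<DT|PRP\$>?<JJ.*>*<CD>*<NN.*>+}"  # optional determiner, adjectives, numbers, then nouns
np_chunker = nltk.RegexpParser(np_grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(np_chunker.evaluate(test_sents))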
def tagged_text(text):
    return [nltk.str2tuple(w) for w in text.split()]
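# Quick check of tagged_text on a pre-tagged string:
print(tagged_text("the/DT cat/NN sat/VBD"))
# [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]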
import pandas
from pandas import DataFrame
import nltk
import numpy as np
import viterbi  # local module providing the Decoder class used below

infile = "JurafskyMartinHmmDecode.xlsx"
Apandas = pandas.read_excel(infile, sheetname="Transitions")
# print Apandas
rownames = Apandas.index.tolist()
A = np.array(Apandas)
Bpandas = pandas.read_excel(infile, 'ObsLikelihood')
# print Bpandas
B = np.array(Bpandas)
statenames = Bpandas.index.tolist()
trans = A[1:, :]
pi = np.expand_dims(np.array(A[0, :]), 1)
decoder = viterbi.Decoder(pi, trans, B)

""" do the decoding """
states = decoder.Decode(np.arange(5))
result = np.array(statenames)[states].tolist()
sentence = Bpandas.columns.tolist()
resultTagged = zip(sentence, result)

correct = ' Janet/NNP will/MD back/VB the/DT bill/NN'
correct = [nltk.str2tuple(x) for x in correct.split()]
assert (resultTagged == correct)
print "PASSED"
infile="JurafskyMartinHmmDecode.xlsx" Apandas = pandas.read_excel(infile,sheetname="Transitions") #print Apandas rownames = Apandas.index.tolist() A=np.array(Apandas) Bpandas = pandas.read_excel(infile,'ObsLikelihood') #print Bpandas B=np.array(Bpandas) statenames = Bpandas.index.tolist() trans=A[1:,:] pi=np.expand_dims(np.array(A[0,:]),1) decoder = viterbi.Decoder(pi,trans, B) """ do the decoding """ states = decoder.Decode(np.arange(5)) result = np.array(statenames)[states].tolist() sentence = Bpandas.columns.tolist() resultTagged = zip(sentence,result) correct=' Janet/NNP will/MD back/VB the/DT bill/NN' correct=[nltk.str2tuple(x) for x in correct.split()] assert (resultTagged==correct) print "PASSED"
import sys
import nltk
from libraries import files

if len(sys.argv) >= 2:
    FILE = sys.argv[1]
    reviews = files.read_from_xml(FILE)
    count = 0
    for review in reviews:
        text = review.content.split()
        for token in text:
            str_token = nltk.str2tuple(token, '/')
            if str_token[1] is None or str_token[1] == '':
                print str_token
                count += 1
    print count
else:
    print 'Invalid command. Please use the format:\n python check-missing-tags.py <filename>'
    sys.exit()