def __init__(self, path, sep=u'/'):
    self.sentences = []
    for line in codecs.open(path, 'r', 'utf-8'):
        tuples = line.strip().split()
        self.sentences.append({
            "words": [str2tuple(u, sep=sep)[0] for u in tuples],
            "POS": [str2tuple(u, sep=sep)[1] for u in tuples]
        })
def __init__(self, path, sep=u'/'):
    self.sentences = []
    with open(path, 'r') as f:
        for line in f:
            tuples = line.strip().split()
            self.sentences.append({
                "words": [str2tuple(u, sep=sep)[0] for u in tuples],
                "POS": [str2tuple(u, sep=sep)[1] for u in tuples]
            })
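Both constructors above rely on NLTK's str2tuple to split "word/TAG" tokens into (word, tag) pairs. A minimal standalone check of that helper, with made-up sample tokens:

from nltk.tag import str2tuple

# str2tuple splits at the last occurrence of sep and upper-cases the tag part.
print(str2tuple(u'fly/NN'))            # ('fly', 'NN')
print(str2tuple(u'Hund_NN', sep='_'))  # ('Hund', 'NN')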
def process_paston_data(filename, windowsize=0):
    with open(filename) as file:
        raw_text = file.read()  # load raw file
    letters_temp = raw_text.split('<Q')  # split into letters based on <Q text
    letters_temp.pop(0)  # remove the first item, which is blank
    letters = ["<Q" + i for i in letters_temp]  # put the <Q back on
    letters2 = [i.splitlines() for i in letters]  # split into lines
    letters3 = [i[8::6] for i in letters2]  # select the lines which correspond to letters
    for letter in letters3:
        for index, sentence in enumerate(letter):
            letter[index] = sentence.split()  # split each sentence into word_POS chunks
    for l in letters3:
        for s in l:
            if windowsize == 0:
                s.append("ENDPAD_END")
                s.insert(0, "STARTPAD_START")
            if windowsize > 0:
                for i in range(1, windowsize + 1):
                    s.append("ENDPAD+" + str(i) + '_END+' + str(i))
                    s.insert(0, "STARTPAD-" + str(i) + '_START-' + str(i))
    # Flatten the letters into a flat list of sentences.
    data = []
    for letter in letters3:
        for sent in letter:
            data.append(sent)
    # Split each "word_POS" chunk into a (word, POS) tuple.
    data2 = [[str2tuple(x, sep="_") for x in sent] for sent in data]
    # Flatten the whole thing into one long list of (word, POS) tuples.
    data3 = [pair for sent in data2 for pair in sent]
    print('Processing', filename)
    return [(x.lower(), y) for (x, y) in data3]
def evaluate_file(self, gold_file):
    corpus = []
    tag_convert = lambda t: (t[0].decode('utf-8'), t[1].decode('utf-8'))
    lines = open(gold_file).readlines()
    for sent in lines:
        corpus.append([tag_convert(str2tuple(t)) for t in sent.split()])
    return super(PersianPOSTagger, self).evaluate(corpus)
def get_obs_states(tokens):
    """Getting states (columns) and observations (rows) for trellis"""
    token_tuples = [str2tuple(token) for token in tokens]
    obs, states = zip(*token_tuples)
    if "cz" in args.text:
        # limit tags because of memory error
        new_states = [state[:2] for state in states if state]
        states = new_states
    return obs, states
def _parse_utterance(self, utterance, include_tag):
    m = self._UTTERANCE_RE.match(utterance)
    if m is None:
        raise ValueError('Bad utterance %r' % utterance)
    speaker, id, text = m.groups()
    words = [str2tuple(s, self._SEP) for s in text.split()]
    if not include_tag:
        words = [w for (w, t) in words]
    return SwitchboardTurn(words, speaker, id)
def setup_counts(files, report=False):
    """Setup word_tag, tag, and tag_bigram counts."""
    words_tag_counts = dict()
    unigram_counts = dict()
    bigram_counts = dict()
    states = list()
    unk_words = get_unknown_words(files)
    vocab = set()
    for file in files:
        with open(file, 'r') as f:
            buffer = f.read()
        sents = sent_tokenize(buffer)
        for sent in sents:
            tags_in_sents = list()
            tagged_words = sent.strip().replace("\n", " ").replace("\t", " ")
            words_tags = ([("<START>", "<START>")]
                          + [str2tuple(t) for t in tagged_words.split()]
                          + [("<END>", "<END>")])
            for word, tag in words_tags:
                if word in unk_words:
                    word = "<UNK>"
                # Emission counts.
                words_tag_counts[tag] = words_tag_counts.get(tag, dict())
                words_tag_counts[tag][word] = words_tag_counts[tag].get(word, 0) + 1
                # Unigram counts.
                unigram_counts[tag] = unigram_counts.get(tag, 0) + 1
                # Collect the tag sequence of this sentence to generate bigrams later.
                tags_in_sents += [tag]
                # Add word to vocab.
                vocab.add(word)
            states += [tags_in_sents]
    for state in states:
        bigrams = _generate_ngrams(state, 2)
        for ti_1, ti in bigrams:
            bigram_counts[ti_1] = bigram_counts.get(ti_1, dict())
            bigram_counts[ti_1][ti] = bigram_counts[ti_1].get(ti, 0) + 1
    if report:
        print_unigram_counts(unigram_counts, file='tag_unigram_counts.txt')
        print_bigram_counts(words_tag_counts, file='word_tag_counts.txt',
                            header="Word Tag Counts")
        print_bigram_counts(bigram_counts, file='tag_bigram_counts.txt',
                            header="Tag Bigram Counts")
    return unigram_counts, bigram_counts, words_tag_counts, vocab
def _parse_utterance(self, utterance, include_tag, tagset=None):
    m = self._UTTERANCE_RE.match(utterance)
    if m is None:
        raise ValueError("Bad utterance %r" % utterance)
    speaker, id, text = m.groups()
    words = [str2tuple(s, self._SEP) for s in text.split()]
    if not include_tag:
        words = [w for (w, t) in words]
    elif tagset and tagset != self._tagset:
        words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
    return SwitchboardTurn(words, speaker, id)
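The _parse_utterance variants above depend on class attributes of their corpus reader. A runnable sketch with stand-in values for the regex, the separator, and the SwitchboardTurn type (all three are assumptions here, not the reader's actual attributes):

import re
from collections import namedtuple

from nltk.tag import str2tuple

# Stand-ins for the reader's class attributes; the real values may differ.
UTTERANCE_RE = re.compile(r'(\w+)\.(\d+):\s*(.*)')
SEP = '/'
SwitchboardTurn = namedtuple('SwitchboardTurn', ['words', 'speaker', 'id'])

def parse_utterance(utterance, include_tag=True):
    m = UTTERANCE_RE.match(utterance)
    if m is None:
        raise ValueError('Bad utterance %r' % utterance)
    speaker, id, text = m.groups()
    words = [str2tuple(s, SEP) for s in text.split()]
    if not include_tag:
        words = [w for (w, t) in words]
    return SwitchboardTurn(words, speaker, id)

print(parse_utterance('A.1: Hi/UH there/RB'))
# SwitchboardTurn(words=[('Hi', 'UH'), ('there', 'RB')], speaker='A', id='1')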
def read_block(self, stream):
    line = stream.readline()
    if line.startswith('<'):
        return []
    sent = [str2tuple(word, sep='_') for word in line.split()]
    if self._tag_mapping_function:
        sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
    if not self._tagged:
        sent = [w for (w, t) in sent]
    if self._group_by_sent:
        return [sent]
    else:
        return sent
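This reader expects one sentence per line with underscore-separated word_TAG tokens, skipping markup lines that start with '<'. A quick standalone check of the parsing step, with an invented sample line:

from nltk.tag import str2tuple

# One sentence per line, tokens written as word_TAG.
line = "Ich_PPER sehe_VVFIN den_ART Hund_NN"
sent = [str2tuple(word, sep='_') for word in line.split()]
print(sent)  # [('Ich', 'PPER'), ('sehe', 'VVFIN'), ('den', 'ART'), ('Hund', 'NN')]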
def slang_removal(data, replace, dismiss):
    final_data_list = []
    user_tweets = []
    for line in data:
        for tweet in line:
            part = []
            for (word, tags) in tweet:
                if dismiss.get(word, "NO") == "NO":
                    replacement = replace.get(word, word)
                    tagged_token = tag.str2tuple(replacement + "/" + tags)
                    part.append(tagged_token)
            user_tweets.append(part)
        final_data_list.append(user_tweets)
        user_tweets = []
    return final_data_list
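A small usage sketch for the function above, assuming slang_removal is defined in the same session and NLTK's tag module has been imported; the tweet and the replace/dismiss dictionaries are invented toy data:

from nltk import tag  # slang_removal calls tag.str2tuple internally

data = [[[("u", "PRP"), ("lol", "UH"), ("rock", "VBP")]]]  # one user, one tweet
replace = {"u": "you"}    # slang -> replacement
dismiss = {"lol": "YES"}  # tokens to drop entirely

print(slang_removal(data, replace, dismiss))
# [[[('you', 'PRP'), ('rock', 'VBP')]]]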
def stemming_stop_removal(porter, data):
    sns_pos_data = []
    sns_list = []
    stop = stopwords.words('english')
    for row in data:
        for tweet in row:
            for (word, tags) in tweet:
                if word not in stop:
                    word = porter.stem(word)
                    tagged_token = tag.str2tuple(word + "/" + tags)
                    sns_list.append(tagged_token)
                else:
                    continue
        sns_pos_data.append(sns_list)
        sns_list = []
    return sns_pos_data
def get_unknown_words(files):
    """Return a set of words that occur fewer than 6 times."""
    count_dict = dict()
    for file in files:
        with open(file, "r") as f:
            buffer = f.read()
        sents = sent_tokenize(buffer)
        for sent in sents:
            tagged_words = sent.strip().replace("\n", " ").replace("\t", " ")
            words_tags = [str2tuple(t) for t in tagged_words.split()]
            for word, _ in words_tags:
                count_dict[word] = count_dict.get(word, 0) + 1
    unk_words = set(
        dict(filter(lambda elem: elem[1] <= 5, count_dict.items())).keys())
    return unk_words
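A tiny check of the count <= 5 cut-off used above, applied to hand-made counts instead of files (the words and counts are invented):

# Same rare-word filter as in get_unknown_words, on toy counts.
count_dict = {"the": 120, "wyth": 7, "selden": 3}
unk_words = set(
    dict(filter(lambda elem: elem[1] <= 5, count_dict.items())).keys())
print(unk_words)  # {'selden'}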
def read_block(self, stream):
    """Reads one paragraph at a time."""
    block = []
    for para_str in self._para_block_reader(stream):
        para = []
        for sent_str in self._sent_tokenizer.tokenize(para_str):
            sent = [str2tuple(s, self._sep)
                    for s in self._word_tokenizer.tokenize(sent_str)]
            if not self._tagged:
                sent = [w for (w, t) in sent]
            if self._group_by_sent:
                para.append(sent)
            else:
                para.extend(sent)
        if self._group_by_para:
            block.append(para)
        else:
            block.extend(para)
    return block
# Smoothing lexical model (add lambda)
lpc = LexProbsCounts(get_obs_states(sup_T))  # used in BW as raw params
# Smoothing unigram model (add lambda)
ipc = InitProbsCounts(sup_T)  # used in BW as raw params

# === UNSUPERVISED PART ===
# Learning parameters
trans_probs, emis_probs = baum_welch(unsup_T, args.iter, args.treshold)
# Smoothing transition probabilities
bpc.bigr_probs = trans_probs
# Smoothing lexical probabilities
lpc.w_t_counts = emis_probs

# === DECODING PART ===
S_sents = split_sentences(S)
if "en" in args.text:
    states = set(str2tuple(token)[1] for token in tokens)  # all tags in the data
    # alpha for pruning, n for pruning, n_path for backtracking
    evaluate(S_sents, states, ipc, lpc, bpc,
             alpha=2 ** (-70), n=20, n_path=30, lang="en", mode="bigr")
else:
    states = set(str2tuple(token)[1] for token in tokens if len(token) > 10)  # all tags in the data
    states = set([state[:2] for state in states])
    evaluate(S_sents, states, ipc, lpc, bpc,
             alpha=2 ** (-100), n=5, n_path=5, lang="cz", mode="bigr")
def get_tags(tokens):
    """Getting just tags for every line"""
    return [str2tuple(token)[1] for token in tokens]
def split_sentences(tokens):
    """Splitting data into sentences"""
    sentences = [[str2tuple(token) for token in sent.split("\n") if token != ""]
                 for sent in "\n".join(tokens).split("###/###")]
    return [sentence for sentence in sentences if sentence != []]
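A quick check of the two helpers above on a hand-made token list, assuming get_tags and split_sentences are defined in the same session; the "###/###" boundary marker mirrors the data format the surrounding script expects, and the sample tokens are invented:

from nltk.tag import str2tuple  # used by both helpers

tokens = ["The/DT", "dog/NN", "barks/VBZ", "###/###", "It/PRP", "runs/VBZ"]

print(get_tags(tokens))
# ['DT', 'NN', 'VBZ', '###', 'PRP', 'VBZ']

print(split_sentences(tokens))
# [[('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')], [('It', 'PRP'), ('runs', 'VBZ')]]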
S, H, T = split_data(tokens)

# Smoothing trigram model (linear interpolation)
tpc = TriProbsCounts(get_tags(T))
tpc.EM(get_tags(H))
# Smoothing lexical model (add lambda)
lpc = LexProbsCounts(get_obs_states(T + H))
# Smoothing unigram model (add lambda)
ipc = InitProbsCounts(T + H)

S_sents = split_sentences(S)
if "en" in args.text:
    states = set(str2tuple(token)[1] for token in tokens)  # all tags in the data
    # alpha for pruning, n for pruning, n_path for backtracking
    evaluate(S_sents, states, ipc, lpc, tpc,
             alpha=2**(-70), n=20, n_path=30, lang="en")
else:
    states = set(str2tuple(token)[1]
                 for token in tokens if len(token) > 10)  # all tags in the data