Example 1
 def __init__(self, path, sep=u'/'):
     self.sentences = []
     for line in codecs.open(path, 'r', 'utf-8'):
         tuples = line.strip().split()
         self.sentences.append({
             "words" : [str2tuple(u, sep=sep)[0] for u in tuples],
             "POS" : [str2tuple(u, sep=sep)[1] for u in tuples]
         })
Example 2
 def __init__(self, path, sep=u'/'):
     self.sentences = []
     for line in codecs.open(path, 'r', 'utf-8'):
         tuples = line.strip().split()
         self.sentences.append({
             "words" : [str2tuple(u, sep=sep)[0] for u in tuples],
             "POS" : [str2tuple(u, sep=sep)[1] for u in tuples]
         })
Example 3
 def __init__(self, path, sep=u'/'):
     self.sentences = []
     with open(path, 'r') as f:
         while True:
             line = f.readline()
             if not line:
                 break
             tuples = line.strip().split()
             self.sentences.append({
                 "words": [str2tuple(u, sep=sep)[0] for u in tuples],
                 "POS": [str2tuple(u, sep=sep)[1] for u in tuples]
             })
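
Examples 1-3 all hinge on str2tuple, which splits a token on the last occurrence of the separator into a (word, TAG) pair. A minimal standalone sketch of that call (the sample sentence is invented for illustration):

    from nltk.tag import str2tuple

    line = u'the/DT cat/NN sleeps/VBZ'
    tuples = line.strip().split()
    words = [str2tuple(u, sep=u'/')[0] for u in tuples]  # ['the', 'cat', 'sleeps']
    POS = [str2tuple(u, sep=u'/')[1] for u in tuples]    # ['DT', 'NN', 'VBZ']
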
Example 4
        def process_paston_data(filename):
            with open(filename) as file:
                raw_text = file.read()#load raw file
            letters_temp = raw_text.split('<Q')#split into letters based on <Q text
            letters_temp.pop(0) #remove the first item, which is blank
            letters = ["<Q"+i for i in letters_temp]#put the <Q back on
            letters2 = [i.splitlines() for i in letters]#split into lines
            letters3 = [i[8::6] for i in letters2]#select the lines which correspond to letters
            for letter in letters3:
                for index, sentence in enumerate(letter):
                    letter[index] = sentence.split()#splits each sentence into word_POS chunks
            for l in letters3:
                for s in l:
                    if self.windowsize == 0:
                        s.append("ENDPAD_END")
                        s.insert(0, "STARTPAD_START")
                    if self.windowsize > 0:
                        for i in range(1, self.windowsize + 1):
                            s.append("ENDPAD+" + str(i) + '_END+' + str(i))
                            s.insert(0, "STARTPAD-" + str(i) + '_START-' + str(i))
            data = []
            for letter in letters3:
                for sent in letter:
                    data.append(sent) #This makes a flat list of the letters, with sentences as the items.
            data2 = []
            for i in range(0,len(data)):
                data2.append([str2tuple(x, sep="_") for x in data[i]])#This splits each "word_POS" into (word, POS)
            data3 = []
            for sent in data2:
                for pair in sent:
                    data3.append(pair)#This flattens the whole thing into a big long list of tuples

            #self.data = [pair for pair in data3]# if pair[1] not in junk]#This returns everything, removing junk things and punctuation
            print('Processing', filename)
            return [(x.lower(),y) for (x,y) in data3]
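
The padding loop above wraps each word_POS sentence in window-sized boundary markers before it is split into tuples. A short sketch of what one sentence looks like for a window size of 2 (the tokens are invented for illustration):

    windowsize = 2
    s = ["right_RG", "trusty_AJ", "friend_N"]
    for i in range(1, windowsize + 1):
        s.append("ENDPAD+" + str(i) + '_END+' + str(i))
        s.insert(0, "STARTPAD-" + str(i) + '_START-' + str(i))
    # s is now:
    # ['STARTPAD-2_START-2', 'STARTPAD-1_START-1',
    #  'right_RG', 'trusty_AJ', 'friend_N',
    #  'ENDPAD+1_END+1', 'ENDPAD+2_END+2']
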
Example 5
	def evaluate_file(self, gold_file):
		corpus = []
		tag_convert = lambda t: (t[0].decode('utf-8'), t[1].decode('utf-8'))
		lines = open(gold_file).readlines()
		for sent in lines:
			corpus.append([tag_convert(str2tuple(t)) for t in sent.split()])
		return super(PersianPOSTagger, self).evaluate(corpus)
Example 6
def get_obs_states(tokens):
    """Getting states (columns) and observations (rows) for trellis"""
    token_tuples = [str2tuple(token) for token in tokens]
    obs, states = zip(*token_tuples)
    if "cz" in args.text:  # limit tags because of memory error
        new_states = [state[:2] for state in states if state]
        states = new_states
    return obs, states
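
The zip(*token_tuples) call transposes the list of (word, tag) pairs into one sequence of observations and one of states. A quick illustration with invented tokens:

    from nltk.tag import str2tuple

    tokens = ["He/PRP", "laughed/VBD", "./."]
    token_tuples = [str2tuple(token) for token in tokens]
    obs, states = zip(*token_tuples)
    # obs    == ('He', 'laughed', '.')
    # states == ('PRP', 'VBD', '.')
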
Example 7
 def _parse_utterance(self, utterance, include_tag):
     m = self._UTTERANCE_RE.match(utterance)
     if m is None:
         raise ValueError('Bad utterance %r' % utterance)
     speaker, id, text = m.groups()
     words = [str2tuple(s, self._SEP) for s in text.split()]
     if not include_tag:
         words = [w for (w,t) in words]
     return SwitchboardTurn(words, speaker, id)
Example 8
 def _parse_utterance(self, utterance, include_tag):
     m = self._UTTERANCE_RE.match(utterance)
     if m is None:
         raise ValueError('Bad utterance %r' % utterance)
     speaker, id, text = m.groups()
     words = [str2tuple(s, self._SEP) for s in text.split()]
     if not include_tag:
         words = [w for (w, t) in words]
     return SwitchboardTurn(words, speaker, id)
Example 9
def setup_counts(files, report=False):
    """Setup word_tag, tag, and tag_bigram counts."""
    words_tag_counts = dict()
    unigram_counts = dict()
    bigram_counts = dict()
    states = list()
    unk_words = get_unknown_words(files)
    vocab = set()

    for file in files:
        with open(file, 'r') as f:
            buffer = f.read()
            sents = sent_tokenize(buffer)
            for sent in sents:
                tags_in_sents = list()
                tagged_words = sent.strip().replace("\n",
                                                    " ").replace("\t", " ")
                words_tags = [
                    ("<START>", "<START>")
                ] + [str2tuple(t)
                     for t in tagged_words.split()] + [("<END>", "<END>")]
                for word, tag in words_tags:
                    if word in unk_words:
                        word = "<UNK>"
                    # Emission Counts.
                    words_tag_counts[tag] = words_tag_counts.get(tag, dict())
                    words_tag_counts[tag][word] = words_tag_counts[tag].get(
                        word, 0) + 1
                    # Unigram Counts
                    unigram_counts[tag] = unigram_counts.get(tag, 0) + 1
                    # Get states in sent to generate bigrams.
                    tags_in_sents += [tag]
                    # Add word to vocab
                    vocab.add(word)
                states += [tags_in_sents]
    for state in states:
        bigrams = _generate_ngrams(state, 2)
        for ti_1, ti in bigrams:
            bigram_counts[ti_1] = bigram_counts.get(ti_1, dict())
            bigram_counts[ti_1][ti] = bigram_counts[ti_1].get(ti, 0) + 1

    if report:
        print_unigram_counts(unigram_counts, file='tag_unigram_counts.txt')
        print_bigram_counts(words_tag_counts,
                            file='word_tag_counts.txt',
                            header="Word Tag Counts")
        print_bigram_counts(bigram_counts,
                            file='tag_bigram_counts.txt',
                            header="Tag Bigram Counts")

    return unigram_counts, bigram_counts, words_tag_counts, vocab
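
The dictionaries returned above are the usual ingredients of an HMM tagger. As a hedged sketch (the helper below is my own illustration, not part of the original project), the counts could be turned into maximum-likelihood transition and emission probabilities like this:

    def mle_probs(unigram_counts, bigram_counts, words_tag_counts):
        # P(t_i | t_{i-1}) = C(t_{i-1}, t_i) / C(t_{i-1})
        trans = {
            prev: {t: c / unigram_counts[prev] for t, c in nexts.items()}
            for prev, nexts in bigram_counts.items()
        }
        # P(w | t) = C(t, w) / C(t)
        emit = {
            t: {w: c / unigram_counts[t] for w, c in words.items()}
            for t, words in words_tag_counts.items()
        }
        return trans, emit
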
Example 10
 def _parse_utterance(self, utterance, include_tag, tagset=None):
     m = self._UTTERANCE_RE.match(utterance)
     if m is None:
         raise ValueError("Bad utterance %r" % utterance)
     speaker, id, text = m.groups()
     words = [str2tuple(s, self._SEP) for s in text.split()]
     if not include_tag:
         words = [w for (w, t) in words]
     elif tagset and tagset != self._tagset:
         words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
     return SwitchboardTurn(words, speaker, id)
Example 11
 def _parse_utterance(self, utterance, include_tag, tagset=None):
     m = self._UTTERANCE_RE.match(utterance)
     if m is None:
         raise ValueError("Bad utterance %r" % utterance)
     speaker, id, text = m.groups()
     words = [str2tuple(s, self._SEP) for s in text.split()]
     if not include_tag:
         words = [w for (w, t) in words]
     elif tagset and tagset != self._tagset:
         words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
     return SwitchboardTurn(words, speaker, id)
Example 12
 def read_block(self, stream):
     line = stream.readline()
     if line.startswith('<'):
         return []
     sent = [str2tuple(word, sep='_') for word in line.split()]
     if self._tag_mapping_function:
         sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
     if not self._tagged: sent = [w for (w,t) in sent]
     if self._group_by_sent:
         return [sent]
     else:
         return sent
Example 13
 def read_block(self, stream):
     line = stream.readline()
     if line.startswith('<'):
         return []
     sent = [str2tuple(word, sep='_') for word in line.split()]
     if self._tag_mapping_function:
         sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
     if not self._tagged: sent = [w for (w, t) in sent]
     if self._group_by_sent:
         return [sent]
     else:
         return sent
Example 14
def slang_removal(data, replace, dismiss):
    final_data_list = []
    user_tweets = []
    for line in data:
        for tweet in line:
            part = []
            for (word, tags) in tweet:
                if dismiss.get(word, "NO") == "NO":
                    replacement = replace.get(word, word)
                    tagged_token = tag.str2tuple(replacement + "/" + tags)
                    part.append(tagged_token)
            user_tweets.append(part)
        final_data_list.append(user_tweets)
        user_tweets = []
    return final_data_list
Example 15
def stemming_stop_removal(porter, data):
    sns_pos_data = []
    sns_list = []
    stop = stopwords.words('english')
    for row in data:
        for tweet in row:
            for (word, tags) in tweet:
                if word not in stop:
                    word = porter.stem(word)
                    tagged_token = tag.str2tuple(word + "/" + tags)
                    sns_list.append(tagged_token)
                else:
                    continue
            sns_pos_data.append(sns_list)
            sns_list = []
    return sns_pos_data
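
Both helpers above rebuild a "word/tag" string only to re-parse it with tag.str2tuple; nltk.tag.tuple2str is the matching inverse, so the round trip can be sketched as follows (the token is invented):

    from nltk.tag import str2tuple, tuple2str

    pair = ("going", "VBG")
    token = tuple2str(pair)          # 'going/VBG'
    assert str2tuple(token) == pair  # back to ('going', 'VBG')
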
Example 16
def get_unknown_words(files):
    """Return a set of words that occour less than 6 times."""
    count_dict = dict()

    for file in files:
        with open(file, "r") as f:
            buffer = f.read()
            sents = sent_tokenize(buffer)
            for sent in sents:
                tagged_words = sent.strip().replace("\n",
                                                    " ").replace("\t", " ")
                words_tags = [str2tuple(t) for t in tagged_words.split()]
                for word, _ in words_tags:
                    count_dict[word] = count_dict.get(word, 0) + 1

    unk_words = {word for word, count in count_dict.items() if count <= 5}
    return unk_words
Example 17
        def process_paston_data(filename):
            with open(filename) as file:
                raw_text = file.read()  #load raw file
            letters_temp = raw_text.split(
                '<Q')  #split into letters based on <Q text
            letters_temp.pop(0)  #remove the first item, which is blank
            letters = ["<Q" + i for i in letters_temp]  #put the <Q back on
            letters2 = [i.splitlines() for i in letters]  #split into lines
            letters3 = [i[8::6] for i in letters2
                        ]  #select the lines which correspond to letters
            for letter in letters3:
                for index, sentence in enumerate(letter):
                    letter[index] = sentence.split(
                    )  #splits each sentence into word_POS chunks
            for l in letters3:
                for s in l:
                    if self.windowsize == 0:
                        s.append("ENDPAD_END")
                        s.insert(0, ("STARTPAD_START"))
                    if self.windowsize > 0:
                        for i in range(1, self.windowsize + 1):
                            s.append("ENDPAD+" + str(i) + '_END+' + str(i))
                            s.insert(0,
                                     "STARTPAD-" + str(i) + '_START-' + str(i))
            data = []
            for letter in letters3:
                for sent in letter:
                    data.append(
                        sent
                    )  #This makes a flat list of the letters, with sentences as the items.
            data2 = []
            for i in range(0, len(data)):
                data2.append([str2tuple(x, sep="_") for x in data[i]
                              ])  #This splits each "word_POS" into (word, POS)
            data3 = []
            for sent in data2:
                for pair in sent:
                    data3.append(
                        pair
                    )  #This flattens the whole thing into a big long list of tuples

            #self.data = [pair for pair in data3]# if pair[1] not in junk]#This returns everything, removing junk things and punctuation
            print('Processing', filename)
            return [(x.lower(), y) for (x, y) in data3]
Example 18
 def read_block(self, stream):
     """Reads one paragraph at a time."""
     block = []
     for para_str in self._para_block_reader(stream):
         para = []
         for sent_str in self._sent_tokenizer.tokenize(para_str):
             sent = [str2tuple(s, self._sep) for s in
                     self._word_tokenizer.tokenize(sent_str)]
             if not self._tagged:
                 sent = [w for (w,t) in sent]
             if self._group_by_sent:
                 para.append(sent)
             else:
                 para.extend(sent)
         if self._group_by_para:
             block.append(para)
         else:
             block.extend(para)
     return block
Example 19
 def read_block(self, stream):
     """Reads one paragraph at a time."""
     block = []
     for para_str in self._para_block_reader(stream):
         para = []
         for sent_str in self._sent_tokenizer.tokenize(para_str):
             sent = [
                 str2tuple(s, self._sep)
                 for s in self._word_tokenizer.tokenize(sent_str)
             ]
             if not self._tagged:
                 sent = [w for (w, t) in sent]
             if self._group_by_sent:
                 para.append(sent)
             else:
                 para.extend(sent)
         if self._group_by_para:
             block.append(para)
         else:
             block.extend(para)
     return block
Example 20
    # Smoothing lexical model (add lambda)
    lpc = LexProbsCounts(get_obs_states(sup_T))     # used in BW as raw params

    # Smoothing unigram model (add lambda)
    ipc = InitProbsCounts(sup_T)    # used in BW as raw params

    # === UNSUPERVISED PART ===

    # Learning parameters
    trans_probs, emis_probs = baum_welch(unsup_T, args.iter, args.treshold)

    # Smoothing transition probabilities
    bpc.bigr_probs = trans_probs

    # Smoothing lexical probabilities
    lpc.w_t_counts = emis_probs

    # === DECODING PART ===

    S_sents = split_sentences(S)

    if "en" in args.text:
        states = set(str2tuple(token)[1] for token in tokens)  # all tags in the data
        evaluate(S_sents, states, ipc, lpc, bpc, alpha=2 ** (-70), n=20, n_path=30, lang="en", mode="bigr")
        # alpha for pruning, n for pruning, n_path for backtracking
    else:
        states = set(str2tuple(token)[1] for token in tokens if len(token) > 10)  # all tags in the data
        states = set([state[:2] for state in states])
        evaluate(S_sents, states, ipc, lpc, bpc, alpha=2 ** (-100), n=5, n_path=5, lang="cz", mode="bigr")
        # alpha for pruning, n for pruning, n_path for backtracking
Example 21
def get_tags(tokens):
    """Getting just tags for every line"""
    return [str2tuple(token)[1] for token in tokens]
Example 22
def split_sentences(tokens):
    """Splitting data into sentences"""
    sentences = [[
        str2tuple(token) for token in sent.split("\n") if token != ""
    ] for sent in "\n".join(tokens).split("###/###")]
    return [sentence for sentence in sentences if sentence != []]
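
As a usage sketch, assuming the "###/###" tokens mark sentence boundaries as in the corpora these scripts expect (the example tokens are invented):

    tokens = ["The/DT", "dog/NN", "barks/VBZ", "###/###", "It/PRP", "runs/VBZ"]
    split_sentences(tokens)
    # -> [[('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
    #     [('It', 'PRP'), ('runs', 'VBZ')]]
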
Example 23
    S, H, T = split_data(tokens)

    # Smoothing trigram model (linear interpolation)
    tpc = TriProbsCounts(get_tags(T))
    tpc.EM(get_tags(H))

    # Smoothing lexical model (add lambda)
    lpc = LexProbsCounts(get_obs_states(T + H))

    # Smoothing unigram model (add lambda)
    ipc = InitProbsCounts(T + H)

    S_sents = split_sentences(S)

    if "en" in args.text:
        states = set(str2tuple(token)[1]
                     for token in tokens)  # all tags in the data
        evaluate(S_sents,
                 states,
                 ipc,
                 lpc,
                 tpc,
                 alpha=2**(-70),
                 n=20,
                 n_path=30,
                 lang="en")
        # alpha for pruning, n for pruning, n_path for backtracking
    else:
        states = set(
            str2tuple(token)[1] for token in tokens
            if len(token) > 10)  # all tags in the data