def read_sequence_list_conll(self, train_file, max_sent_len=100000, max_nr_sent=100000):
    """Read a conll2002 or conll2003 file into a sequence list.

    The arguments are forwarded unchanged to ``self.read_conll_instances``;
    every instance it returns is added to a new ``SequenceList`` created
    over ``self.word_dict``.

    :param train_file: forwarded to ``read_conll_instances``.
    :param max_sent_len: forwarded to ``read_conll_instances``.
    :param max_nr_sent: forwarded to ``read_conll_instances``.
    :return: the ``SequenceList`` holding one sequence per instance.
    """
    # The instances are read before the list is created, so the list is
    # constructed only after reading has finished.
    instances = self.read_conll_instances(train_file, max_sent_len, max_nr_sent)
    sequences = SequenceList(self.word_dict)  # for indices
    for instance in instances:
        sequences.add_sequence(instance)
    return sequences
Example #2
0
    def prepare_chains(self):
        """Fill ``self.train`` with sequences read from the corpus file.

        A new ``SequenceList`` over ``self.x_dict`` is stored in
        ``self.train``. Lines of ``self.corpus_file`` are numbered from 1;
        reading stops at the first line whose number exceeds ``self.howbig``.
        Each kept line is stripped and split on single spaces, and the
        resulting token list is added as one sequence.
        """
        # training sequences
        self.train = SequenceList(self.x_dict)
        print("Creating training from corpus.")
        with open(self.corpus_file) as corpus:
            for line_no, line in enumerate(corpus, 1):
                if line_no > self.howbig:
                    break
                tokens = line.strip().split(" ")
                self.train.add_sequence(tokens)
 def read_sequence_list_conll(self, train_file, max_sent_len=100000, max_nr_sent=100000):
     """Read a conll2002 or conll2003 file into a sequence list.

     All three arguments are handed to ``self.read_conll_instances`` as-is.
     The instances it returns are added, in order, to a ``SequenceList``
     created over ``self.word_dict``, which is then returned.
     """
     parsed = self.read_conll_instances(train_file, max_sent_len, max_nr_sent)
     # Created after parsing, over the word dictionary (for indices).
     result = SequenceList(self.word_dict)
     append = result.add_sequence
     for sentence in parsed:
         append(sentence)
     return result
Example #4
0
    def __init__(self):
        """Create a four-word observation vocabulary and three training sequences.

        ``self.x_dict`` holds the ``LabelDictionary`` of observations and
        ``self.train`` the ``SequenceList`` built over it.
        """
        # observation vocabulary
        self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])

        # training sequences, added in this order
        observations = (
            ["walk", "walk", "shop", "clean"],
            ["walk", "walk", "shop", "clean"],
            ["walk", "shop", "shop", "clean"],
        )
        train_seqs = SequenceList(self.x_dict)
        for sequence in observations:
            train_seqs.add_sequence(sequence)

        self.train = train_seqs
Example #5
0
def load_seq(s):
    """Turn one line of ``word_tag`` tokens into a single tagged sequence.

    The line is right-stripped and split on single spaces; every token is
    split on ``'_'`` and its first two fields are taken as word and tag.
    The words and tags are added to a fresh ``SequenceList`` built from the
    module-level ``word_dict`` and ``tag_dict``, and the first (only)
    sequence of that list is returned.

    :param s: the input line.
    :return: ``seq_list[0]`` after adding the parsed sequence.
    :raises IndexError: if a token contains no ``'_'``.
    """
    from sequences.sequence_list import SequenceList
    seq_list = SequenceList(word_dict, tag_dict)

    # NOTE(review): only the first two '_'-separated fields are used, so a
    # word that itself contains '_' is cut at its first underscore.
    split_tokens = [token.split('_') for token in s.rstrip().split(' ')]
    words = [parts[0] for parts in split_tokens]
    tags = [parts[1] for parts in split_tokens]

    seq_list.add_sequence(words, tags)
    return seq_list[0]
Example #6
0
class TextCorpus:
    """Text corpus read line by line, with a vocabulary file stored next to it.

    The vocabulary file is named ``<corpus_file>.vocab`` and is written by
    ``prepare_vocab_dict`` as one ``word<TAB>frequency`` line per word.
    """

    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        :param corpus_file: path of the corpus file
        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        # file of form: w\tf\n
        self.vocab_file = "{}.vocab".format(corpus_file)

        self.minfreq = minfreq
        self.howbig = howbig

        try:
            self.x_dict = self._load_dictionary()
        except IOError:
            # Presumably the vocabulary file does not exist yet: write it
            # from the corpus and load again.
            self.prepare_vocab_dict()
            self.x_dict = self._load_dictionary()

        print("LabelDictionary created.")

    def _load_dictionary(self):
        """Return a ``LabelDictionary`` built from ``read_vocab`` on the vocabulary file."""
        return LabelDictionary(read_vocab(self.vocab_file, self.minfreq))

    def _token_lines(self, handle):
        """Yield the space-split tokens of each line of *handle*.

        Lines are numbered from 1; iteration stops at the first line whose
        number exceeds ``self.howbig``.
        """
        for line_no, line in enumerate(handle, 1):
            if line_no > self.howbig:
                break
            yield line.strip().split(" ")

    def prepare_chains(self):
        """Store in ``self.train`` a ``SequenceList`` with one sequence per corpus line."""
        # training sequences
        self.train = SequenceList(self.x_dict)
        print("Creating training from corpus.")
        with open(self.corpus_file) as corpus:
            for tokens in self._token_lines(corpus):
                self.train.add_sequence(tokens)

    def prepare_vocab_dict(self):
        """Count token frequencies in the corpus and write them to the vocabulary file.

        Each distinct token becomes one ``token<TAB>count`` line.
        """
        from collections import defaultdict

        frequencies = defaultdict(int)

        # Both files are opened together; counts are written only after the
        # corpus lines have been consumed.
        with open(self.corpus_file) as corpus, open(self.vocab_file, "w") as vocab:
            for tokens in self._token_lines(corpus):
                for token in tokens:
                    frequencies[token] += 1
            vocab.writelines(
                "{}\t{}\n".format(token, count)
                for token, count in frequencies.items()
            )
        print("Vocabulary file prepared.")
Example #7
0
class TextCorpus:
    """Line-oriented text corpus paired with a ``<corpus_file>.vocab`` frequency file."""

    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        :param corpus_file: path of the corpus file
        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.minfreq = minfreq
        self.howbig = howbig
        # Vocabulary file, of form: w\tf\n
        self.vocab_file = "{}.vocab".format(self.corpus_file)

        try:
            vocabulary = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
            self.x_dict = vocabulary
        except IOError:
            # Reading failed (presumably no vocabulary file yet): create the
            # file from the corpus, then read it again.
            self.prepare_vocab_dict()
            vocabulary = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
            self.x_dict = vocabulary

        print("LabelDictionary created.")

    def prepare_chains(self):
        """Build ``self.train`` from the corpus lines numbered up to ``self.howbig``.

        Every kept line is stripped, split on single spaces and added as one
        sequence to a ``SequenceList`` created over ``self.x_dict``.
        """
        # training sequences
        sequences = SequenceList(self.x_dict)
        self.train = sequences
        print("Creating training from corpus.")
        with open(self.corpus_file) as corpus:
            for line_no, line in enumerate(corpus, 1):
                if line_no > self.howbig:
                    break
                sequences.add_sequence(line.strip().split(" "))

    def prepare_vocab_dict(self):
        """Write the vocabulary file: one ``token<TAB>count`` line per distinct token.

        Only corpus lines numbered up to ``self.howbig`` are counted.
        """
        counts = {}

        with open(self.corpus_file) as corpus, open(self.vocab_file, "w") as vocab:
            for line_no, line in enumerate(corpus, 1):
                if line_no > self.howbig:
                    break
                for token in line.strip().split(" "):
                    counts[token] = counts.get(token, 0) + 1
            # Entries are written in first-seen order of the tokens.
            for token in counts:
                vocab.write("{}\t{}\n".format(token, counts[token]))
        print("Vocabulary file prepared.")
Example #8
0
    def prepare_chains(self):
        """Create ``self.train`` and add one sequence per corpus line.

        Lines of ``self.corpus_file`` are counted from 1 and processing
        stops once the count exceeds ``self.howbig``. A line contributes the
        list obtained by stripping it and splitting on single spaces.
        """
        # training sequences
        training = SequenceList(self.x_dict)
        self.train = training
        print("Creating training from corpus.")
        with open(self.corpus_file) as source:
            for position, raw_line in enumerate(source, 1):
                if position > self.howbig:
                    break
                training.add_sequence(raw_line.strip().split(" "))
Example #9
0
def load_seq(s):
    """Build one tagged sequence from a multi-line, column-formatted string.

    ``s`` is decoded with the ``'string-escape'`` codec and split on
    newlines. Lines with fewer than two whitespace-separated columns are
    skipped. For the others, column 1 is the word and column 4, lowercased
    and translated through the module-level ``mapping``, is the tag.

    NOTE(review): ``'string-escape'`` is a Python 2 codec, so ``s`` is
    presumably a Python 2 byte string -- confirm before porting.

    :param s: the encoded sentence text.
    :return: the first sequence of a ``SequenceList`` built from
        ``word_dict`` and ``tag_dict`` holding the parsed words and tags.
    :raises AssertionError: if a tag is missing from ``mapping``, a word
        from ``word_dict``, or a mapped tag from ``tag_dict``.
    """
    from sequences.sequence_list import SequenceList
    seq_list = SequenceList(word_dict, tag_dict)
    words = []
    tags = []
    for row in s.decode('string-escape').split('\n'):
        columns = row.split()
        if len(columns) >= 2:
            raw_tag = columns[4].lower()
            word = columns[1]

            assert raw_tag in mapping
            assert word in word_dict

            tag = mapping[raw_tag]
            assert tag in tag_dict

            words.append(word)
            tags.append(tag)
    seq_list.add_sequence(words, tags)
    return seq_list[0]
Example #10
0
def load_seq(s):
    """Parse column-formatted text into a single word/tag sequence.

    After decoding ``s`` with ``'string-escape'`` and splitting it into
    lines, each line with at least two whitespace-separated fields yields a
    word (field 1) and a tag (field 4, lowercased, then looked up in the
    module-level ``mapping``). Shorter lines are ignored.

    NOTE(review): the ``'string-escape'`` codec exists only in Python 2 --
    verify the expected type of ``s`` against the caller.

    :param s: the encoded sentence text.
    :return: ``seq_list[0]``, the sequence just added to a ``SequenceList``
        created from ``word_dict`` and ``tag_dict``.
    :raises AssertionError: when a tag, word or mapped tag is not found in
        ``mapping``, ``word_dict`` or ``tag_dict`` respectively.
    """
    from sequences.sequence_list import SequenceList
    seq_list = SequenceList(word_dict, tag_dict)
    sentence_words, sentence_tags = [], []
    decoded_lines = s.decode('string-escape').split('\n')
    for decoded_line in decoded_lines:
        fields = decoded_line.split()
        if len(fields) < 2:
            continue
        source_tag = fields[4].lower()
        token = fields[1]

        assert source_tag in mapping
        assert token in word_dict

        target_tag = mapping[source_tag]
        assert target_tag in tag_dict

        sentence_words.append(token)
        sentence_tags.append(target_tag)
    seq_list.add_sequence(sentence_words, sentence_tags)
    return seq_list[0]
Example #11
0
    def __init__(self):
        """Set up ``self.x_dict`` (observation vocabulary) and ``self.train``.

        ``self.train`` is a ``SequenceList`` over ``self.x_dict`` holding
        three fixed observation sequences.
        """
        # observation vocabulary
        vocabulary = ["walk", "shop", "clean", "tennis"]
        self.x_dict = LabelDictionary(vocabulary)

        # training sequences
        train_seqs = SequenceList(self.x_dict)
        for sequence in (
            ["walk", "walk", "shop", "clean"],
            ["walk", "walk", "shop", "clean"],
            ["walk", "shop", "shop", "clean"],
        ):
            train_seqs.add_sequence(sequence)

        self.train = train_seqs
 def __init__(self):
     """Create an empty word dictionary and a sequence list that is built over it."""
     vocabulary = LabelDictionary()
     self.word_dict = vocabulary
     # The sequence list receives the same dictionary object.
     self.sequence_list = SequenceList(vocabulary)