# Example no. 1
    def load_data(self, labeled_data, ids):
        """Tokenize labeled documents and build per-document message tuples.

        For each document id in ``ids``:
          * URLs are replaced by the ``<website>`` token,
          * each sentence is tokenized, punctuation/noise is dropped, and
            numeric/money/percent tokens are normalized to
            ``<number>``/``<money>``/``<percentage>``,
          * kept sentences are wrapped in ``<sos>``/``<eos>`` and truncated
            to ``self.max_seq_len`` tokens.

        Side effects: fills ``self.message[i]`` with
        ``(sentences, labels, sent_len, doc_len)``, fits ``self.kde`` on all
        collected label ids, and stores the softmaxed density over the
        distinct label ids in ``self.esit_dist``.

        Args:
            labeled_data: mapping id -> (list of sentences, list of labels).
            ids: iterable of document ids to load.
        """
        self.message = {}

        # Compiled once, outside the loops (the original recompiled them per
        # sentence).  Literal dots are now escaped: with the unescaped '.',
        # the TLD pattern also matched ordinary words such as 'become' and
        # rewrote them to '<website>'.  re.S was dropped — no pattern uses
        # an unescaped '.', so DOTALL had no effect.
        website_patterns = (
            re.compile(r'www\.[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*\.(com|net|org|io|gov|me|edu)'),
        )

        labels_esit = []

        for i in ids:
            sentences = []
            labels = []
            doc_len = []
            sent_len = []

            sents, l = labeled_data[i]

            for j in range(len(sents)):
                # Work on a local copy instead of mutating the caller's data
                # in place (the original overwrote sents[j]).
                dd = str(sents[j])
                for pattern in website_patterns:
                    dd = pattern.sub(" <website> ", dd)

                raw_tokens = regexp_tokenize(transform_format(dd), self.pattern)

                temp = []
                for tok in raw_tokens:
                    if tok in self.english_punctuations or check_ack_word(tok) != 1:
                        continue
                    if tok.isdigit():
                        tok = '<number>'
                    elif tok[0] == '$':
                        tok = '<money>'
                    elif tok[-1] == '%':
                        tok = '<percentage>'
                    temp.append(tok.lower())

                if temp:
                    # Reserve two slots for the <sos>/<eos> markers.
                    temp_ = ['<sos>'] + temp[:self.max_seq_len - 2] + ['<eos>']
                    sentences.append(temp_)

                    label_id = self.lookup_label_id(l[j])  # looked up once
                    labels.append(label_id)
                    labels_esit.append(label_id)

                    sent_len.append(len(temp_) - 1)

            doc_len.append(len(sents) - 1)

            self.message[i] = (sentences, labels, sent_len, doc_len)

        # Distinct label ids define the support of the estimated density.
        x_d = np.array(list(set(self.label_set.values())))

        # NOTE(review): fit will raise if labels_esit is empty (no usable
        # sentence in any document) — same as the original behavior.
        self.kde.fit(np.array(labels_esit)[:, None])
        self.dist = self.kde.score_samples(x_d[:, None])

        self.esit_dist = F.softmax(torch.tensor(self.dist), dim=-1)
# Example no. 2
    def load_data(self, unlabeled_data, ids):
        """Tokenize unlabeled documents into padded sentence sequences.

        Mirrors the labeled loader, but every kept sentence gets the dummy
        label ``10`` and documents are truncated to ``self.max_seq_num``
        sentences.  Documents that fail to parse are skipped (best effort);
        successfully loaded ids are appended to ``self.ids``.

        Args:
            unlabeled_data: mapping id -> raw document string.
            ids: iterable of document ids to load.
        """
        self.message = {}
        self.ids = []
        self.data_num = 0

        # Compiled once, outside the loop (the original recompiled them for
        # every document).  Literal dots escaped: the unescaped '.' made the
        # TLD pattern match ordinary words such as 'become'.
        website_patterns = (
            re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'www\.[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*\.(com|net|org|io|gov|me|edu)'),
        )

        for i in ids:
            try:
                sentences = []
                labels = []
                doc_len = []
                sent_len = []

                # Trailing '.' ensures the sentence tokenizer closes the
                # final sentence.
                dd = unlabeled_data[i] + '.'
                for pattern in website_patterns:
                    dd = pattern.sub(" <website> ", dd)

                sents = sentence_tokenize(dd)

                for sent in sents:
                    raw_tokens = regexp_tokenize(transform_format(sent),
                                                 self.pattern)
                    temp = []
                    for tok in raw_tokens:
                        if tok in self.english_punctuations or check_ack_word(tok) != 1:
                            continue
                        if tok.isdigit():
                            tok = '<number>'
                        elif tok[0] == '$':
                            tok = '<money>'
                        elif tok[-1] == '%':
                            tok = '<percentage>'
                        temp.append(tok.lower())

                    if temp:
                        # Reserve two slots for the <sos>/<eos> markers.
                        temp_ = ['<sos>'] + temp[:self.max_seq_len - 2] + ['<eos>']
                        sentences.append(temp_)
                        labels.append(10)  # dummy label for unlabeled data
                        sent_len.append(len(temp_) - 1)

                doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))

                self.message[i] = (sentences[:self.max_seq_num],
                                   labels[:self.max_seq_num],
                                   sent_len[:self.max_seq_num], doc_len)
                self.ids.append(i)

            except Exception:
                # Best-effort: skip malformed documents.  Narrowed from the
                # original bare 'except:', which also swallowed
                # KeyboardInterrupt and SystemExit.
                continue
# Example no. 3
    def build_vocab(self, unlabeled_data, labeled_data, embedding_size,
                    max_seq_num, max_seq_len):
        """Train word2vec embeddings and build the word<->id vocabulary.

        Cleans and tokenizes every sentence from ``unlabeled_data`` (raw
        documents, sentence-split) and ``labeled_data`` (pre-split
        sentences), trains a Word2Vec model on the result, then assigns ids:
        0 = <pad>, 1 = <unk>, 2 = <sos>, 3 = <eos>, and 4+ for words that
        occur at least twice and are present in the trained model.

        Side effects: sets ``self.model``, ``self.word2id``, ``self.id2word``
        and ``self.unk_count``.  ``max_seq_num``/``max_seq_len`` are accepted
        for interface compatibility but not used here.
        """
        # Compiled once; literal dots escaped (the unescaped '.' made the
        # TLD pattern match ordinary words such as 'become').
        website_patterns = (
            re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'www\.[a-zA-Z0-9.?/&=:#%_-]*'),
            re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*\.(com|net|org|io|gov|me|edu)'),
        )

        def strip_urls(text):
            # Replace every URL-like span with the <website> token.
            for pattern in website_patterns:
                text = pattern.sub(" <website> ", text)
            return text

        def tokenize_sentence(sent):
            # Tokenize one sentence, drop punctuation/noise, normalize
            # numeric/money/percent tokens, lower-case everything.
            temp = []
            for tok in regexp_tokenize(transform_format(sent), self.pattern):
                if tok in self.english_punctuations or check_ack_word(tok) != 1:
                    continue
                if tok.isdigit():
                    tok = '<number>'
                elif tok[0] == '$':
                    tok = '<money>'
                elif tok[-1] == '%':
                    tok = '<percentage>'
                temp.append(tok.lower())
            return temp

        sentences = []
        words = []

        if unlabeled_data is not None:
            for v in unlabeled_data.values():
                try:
                    for sent in sentence_tokenize(strip_urls(v)):
                        temp = tokenize_sentence(sent)
                        words.extend(temp)
                        if temp:
                            sentences.append(temp)
                except Exception:
                    # Best-effort: skip unparsable documents.  Narrowed from
                    # the original bare 'except:'.
                    continue

        if labeled_data is not None:
            for u, v in labeled_data.items():
                for i in range(len(v[0])):
                    # Kept from the original: normalize in place so later
                    # passes over labeled_data see strings.
                    v[0][i] = str(v[0][i])
                    try:
                        dd = strip_urls(v[0][i])
                    except Exception:
                        # Labeled data is expected to be clean; keep the
                        # original fail-fast diagnostics.
                        print(u, v)
                        print(v[0][i])
                        exit()
                    temp = tokenize_sentence(dd)
                    words.extend(temp)
                    if temp:
                        sentences.append(temp)

        word_frequency = {}
        for word in words:
            word_frequency[word] = word_frequency.get(word, 0) + 1

        # NOTE(review): 'size'/'iter' are the gensim<4 keyword names
        # (renamed to vector_size/epochs in gensim 4) — kept as-is to match
        # the version this project pins.
        self.model = gensim.models.Word2Vec(sentences,
                                            size=embedding_size,
                                            window=5,
                                            min_count=1,
                                            iter=20,
                                            negative=50)

        # Ids 0-3 are reserved for the special tokens; real words start at 4.
        next_id = 4
        self.word2id['<pad>'] = 0
        self.id2word[0] = '<pad>'
        self.word2id['<sos>'] = 2
        self.id2word[2] = '<sos>'
        self.word2id['<eos>'] = 3
        self.id2word[3] = '<eos>'

        self.unk_count = 0

        for sent in sentences:
            for word in sent:  # tokens are already lower-cased
                if word_frequency[word] >= 2:
                    if word in self.model and word not in self.word2id:
                        self.word2id[word] = next_id
                        self.id2word[next_id] = word
                        next_id += 1
                else:
                    # Rare word: register <unk> lazily (as the original did)
                    # and count every rare occurrence.
                    self.word2id['<unk>'] = 1
                    self.id2word[1] = '<unk>'
                    self.unk_count += 1