Example 1
import json

import numpy as np
from tqdm import tqdm

import utils


def main():
    print('Load raw data')
    data = utils.load_dumped('../data/raw/dump.txt')

    print('Filter text')
    content = [utils.filter_text(entry[1]) for entry in tqdm(data)]

    # Shuffle document indices with a fixed seed for a reproducible split
    idx = np.arange(len(content))
    np.random.seed(19)
    np.random.shuffle(idx)

    # Hold out 10% of the documents as the test set
    test_len = int(0.1 * len(idx))

    print('Split into train/test')
    test = "".join(content[i] for i in tqdm(idx[:test_len]))
    train = "".join(content[i] for i in tqdm(idx[test_len:]))

    vocab = utils.generate_vocab()
    with open('../data/processed/vocab.json', 'w') as fout:
        json.dump(vocab, fout)

    print('Encoding test')
    test = utils.encode_text(test, vocab)
    np.save('../data/processed/test', test)

    print('Encoding train')
    train = utils.encode_text(train, vocab)
    np.save('../data/processed/train', train)
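
The `utils` helpers are not shown in this snippet. As a rough sketch of what the script assumes, a character-level `generate_vocab` and `encode_text` could look like this (hypothetical implementations, not the project's actual code):

import string

# Hypothetical helpers, assuming a character-level vocabulary.
def generate_vocab():
    # Map each printable character to an integer id
    return {ch: i for i, ch in enumerate(string.printable)}

def encode_text(text, vocab):
    # Encode a string as a list of integer ids, skipping unknown characters
    return [vocab[ch] for ch in text if ch in vocab]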
Example 2
from tensorflow import keras

# filter_text, modify_word_index, encode_review, decode_review and
# add_padding are project helpers defined elsewhere.


def test():
    model = keras.models.load_model("tf/model.h5")
    word_index = modify_word_index()

    with open('tf/test_review.txt', encoding='utf-8') as file:
        review = filter_text(file.read())
        print(review)
        # Pad the encoded review to the fixed input length the model expects
        encoded = add_padding([encode_review(review, word_index)],
                              value=word_index['<PAD>'],
                              maxlen=250)
        print(decode_review(encode_review(review, word_index), word_index))
        prediction = model.predict(encoded)
        print("\n\n")
        print("Original Review link:  "
              "https://timesofindia.indiatimes.com/"
              "entertainment/marathi/movie-reviews/"
              "basta/movie-review/80670252.cms")
        print("Rating on Site: 2.5/5")
        print("Prediction: ", prediction[0])
Example 3
    def get_queries(self):
        """Get the list of queries built from the score metadata for querying
        audio sources such as Spotify or EchoNest.

        The queries are built from the title, composer and subtitle of the
        score. From lower to higher index, the strictness (length and
        coverage) of the query decreases and the probability of finding
        results for the query increases:

        | strictness: query
        | 0: title + composer + subtitle
        | 1: title + composer + subtitle, stopwords removed
        | 2: title + composer
        | 3: title + composer, stopwords removed
        | 4: title
        | 5: title, stopwords removed

        | Queries identical to one with lower strictness are removed.

        Returns:
            list of tuples (query, strictness)
        """

        # Collapse repeated whitespace inside each metadata field
        tokens = [' '.join(token.split())
                  for token in (self.title, self.composer, self.subtitle)]
        queries = []
        seen = []
        for j, i in enumerate(range(len(tokens), 0, -1)):
            query = ' '.join(tokens[:i])
            query = utils.remove_multi_spaces_from_text(query)
            query = utils.normalize(query)
            if query not in seen:
                queries.append((query, 2 * j))
                seen.append(query)
            # Odd strictness levels are the same query with stopwords removed
            query_filtered = utils.filter_text(query)
            if query_filtered != query:
                query_filtered = ' '.join(query_filtered.split())
                if query_filtered not in seen:
                    queries.append((query_filtered, 2 * j + 1))
                    seen.append(query_filtered)
        return queries
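
A worked example of the output, assuming `utils.normalize` lowercases the text and `utils.filter_text` removes stopwords (values are hypothetical):

# For title="The Swan", composer="Saint-Saëns",
# subtitle="from The Carnival of the Animals", get_queries() would
# return something like:
#   [('the swan saint-saëns from the carnival of the animals', 0),
#    ('swan saint-saëns carnival animals', 1),
#    ('the swan saint-saëns', 2),
#    ('swan saint-saëns', 3),
#    ('the swan', 4),
#    ('swan', 5)]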
Example 4
    def load_dataset(self, source_file='train.json'):

        with open(source_file) as dataset_file:
            self.qa_dataset = json.load(dataset_file)

        dialogues = [(dialogue_line['question'], dialogue_line['answer'])
                     for dialogue_line in self.qa_dataset]

        questions = set()
        qa_dataset = []

        for replicas in dialogues:
            if len(replicas) < 2:
                continue

            # Strip the two-character prefix, then remove \n, ? and .
            question = filter_text(replicas[0][2:])
            answer = replicas[1][2:]

            # Keep only non-empty, previously unseen questions
            if question and question not in questions:
                questions.add(question)
                qa_dataset.append([question, answer])

        # Inverted index: {'word': [(q, a), ...]}
        qa_by_word_dataset = {}
        for question, answer in qa_dataset:
            for word in question.split(' '):
                qa_by_word_dataset.setdefault(word, []).append(
                    (question, answer))

        # Drop overly common words that index too many pairs to be useful
        qa_by_word_dataset_filtered = {
            word: qa_list
            for word, qa_list in qa_by_word_dataset.items()
            if len(qa_list) < 1000
        }
        return qa_by_word_dataset_filtered
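
The loader assumes `train.json` holds a list of question/answer objects; the two-character prefixes stripped by the `[2:]` slices are an assumption inferred from the code. A minimal example of the expected shape:

[
  {"question": "q What time is it?", "answer": "a It is noon."},
  {"question": "q How are you?", "answer": "a Fine, thanks."}
]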
Example 5
    def generate_answer_by_text(self,
                                text,
                                consider_last_n=1000,
                                min_result_threshold=0.2):
        text = filter_text(text)
        words = text.split(' ')
        qa = []
        # Collect candidate pairs for every input word found in the index
        qa_by_word_dataset_filtered = self.load_dataset()
        for word in words:
            if word in qa_by_word_dataset_filtered:
                qa += qa_by_word_dataset_filtered[word]
        # Deduplicate and cap the number of candidates to score
        qa = list(set(qa))[:consider_last_n]

        results = []
        for question, answer in qa:
            # Edit distance normalized by the stored question's length
            dist = nltk.edit_distance(question, text)
            dist_percentage = dist / len(question)
            results.append([dist_percentage, question, answer])

        if results:
            dist_percentage, question, answer = min(results,
                                                    key=lambda pairs: pairs[0])
            if dist_percentage < min_result_threshold:
                return answer
        return None
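
A sketch of how the retriever might be called, assuming a hypothetical `Bot` class that exposes the two methods above:

# Hypothetical usage: the answer is None unless some stored question is
# within 20% normalized edit distance of the input.
bot = Bot()
answer = bot.generate_answer_by_text('what time is it')
print(answer or 'No close match found')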
Example 6
import numpy as np
from PIL import Image, UnidentifiedImageError
from wordcloud import WordCloud

import utils


def process_file(input_text, mask):

    # Create a frequency dictionary of words from the text file
    lines = utils.remove_punctuation(input_text)
    cleaned_words = utils.filter_text(lines)
    frequencies = utils.calculate_frequencies(cleaned_words)

    # Optional mask for the word cloud
    cloud_mask = None
    if mask:
        try:
            # Resize the mask image to a fixed width, keeping aspect ratio
            basewidth = 1920
            img = Image.open(mask)
            wpercent = basewidth / float(img.size[0])
            hsize = int(float(img.size[1]) * wpercent)
            img = img.resize((basewidth, hsize), Image.LANCZOS)

            # Convert the mask image to an array
            cloud_mask = np.array(img)

        except UnidentifiedImageError:
            raise SystemExit(f"Error: cannot identify image file {mask}")

    # Generate the word-cloud image from the word frequencies
    cloud = WordCloud(width=1920,
                      height=1080,
                      background_color="black",
                      mask=cloud_mask,
                      contour_width=1,
                      contour_color='black',
                      colormap='Set2',
                      collocations=False)
    cloud.generate_from_frequencies(frequencies)

    return cloud
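
A possible call site, assuming the first argument is the file's text content (`WordCloud.to_file` is part of the wordcloud package):

# Hypothetical usage: build the cloud from a text file and save it as a PNG.
with open('corpus.txt', encoding='utf-8') as fin:
    cloud = process_file(fin.read(), mask='mask.png')
cloud.to_file('wordcloud.png')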
Example 7
    def save(self, *args, **kwargs):
        # Set the creation timestamp only on first save
        if not self.id:
            self.created_at = datetime.now()
        # Render the Markdown body to HTML before saving
        self.body_filter = 'Markdown'
        self.body_html = filter_text(self.body, self.body_filter)
        super().save(*args, **kwargs)
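
`filter_text` is not shown here; a minimal sketch of the kind of dispatcher it could be, using the third-party markdown package (names and behavior are assumptions):

import markdown

def filter_text(text, body_filter):
    # Render the body with the selected filter; fall back to the raw text
    if body_filter == 'Markdown':
        return markdown.markdown(text)
    return text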