def make_w2vec_matrix(question, paragraph, model=word2vec):
    train_question = preprocess_sentence(question)
    train_answers = preprocess_sentence(paragraph)
    tokens_question = ViTokenizer.tokenize(train_question).split()
    tokens_answer = ViTokenizer.tokenize(train_answers).split()
    question_embs = []
    answer_embs = []
    # Look up each token's embedding, falling back to the 'unknown' vector for out-of-vocabulary tokens
    for i in range(len(tokens_question)):
        if tokens_question[i] in model:
            question_embs.append(model[tokens_question[i]])
        else:
            question_embs.append(model['unknown'])
    for i in range(len(tokens_answer)):
        if tokens_answer[i] in model:
            answer_embs.append(model[tokens_answer[i]])
        else:
            answer_embs.append(model['unknown'])
    question_embs = np.array(question_embs)
    answer_embs = np.array(answer_embs)
    """
	if question_embs.shape[0] < MIN_LENGTH_QUESTION:
	question_embs = np.pad(question_embs, ((4,4), (0,0)))
	"""

    # If the answer is shorter than MIN_LENGTH_ANSWER, repeat its embeddings
    # until the matrix has at least MIN_LENGTH_ANSWER rows
    if answer_embs.shape[0] < MIN_LENGTH_ANSWER:
        paddings = np.ceil(MIN_LENGTH_ANSWER / answer_embs.shape[0])
        d = np.copy(answer_embs)
        for i in range(int(paddings)):
            answer_embs = np.concatenate((answer_embs, d))

    return question_embs, answer_embs
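A minimal usage sketch for the function above, assuming the same helpers are importable and a Vietnamese word2vec model has been loaded with gensim as word2vec (the model path below is hypothetical):

# from gensim.models import KeyedVectors
# word2vec = KeyedVectors.load_word2vec_format("vi_word2vec.bin", binary=True)  # hypothetical path

question = "Thủ đô của Việt Nam là gì ?"
paragraph = "Hà Nội là thủ đô của Việt Nam ."
q_embs, a_embs = make_w2vec_matrix(question, paragraph)
# each matrix has shape (num_tokens, embedding_dim); the answer rows may be
# repeated until the matrix reaches MIN_LENGTH_ANSWER
print(q_embs.shape, a_embs.shape)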
    def query(self, query_string):
        query_string = self.process_spell_errors(query_string)
        query_terms = preprocess_sentence(query_string)
        query_docs = []
        for i in range(len(query_terms)):
            newdocdict = copy.deepcopy(self.index[query_terms[i]])
            for doc in newdocdict:
                #doc is an id with value = list
                newdocdict[doc] = set(map(lambda x: x - i, newdocdict[doc]))
            query_docs.append(newdocdict)

        #query_docs is a list of dictionaries, where each dict corresponds to the posting list of one query term
        answer = []
        for doc in query_docs[0]:
            #check whether this doc contains every query term at an aligned position
            docflag = True
            for position in query_docs[0][doc]:
                posflag = True
                for other_doclist in query_docs[1:]:
                    if doc not in other_doclist:
                        docflag = False
                        break
                    if position not in other_doclist[doc]:
                        posflag = False
                        break

                if docflag and posflag:
                    answer.append(doc)
                    break
                if not docflag:
                    break
        return answer
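The method above assumes a positional index of the form {term: {doc_id: set_of_positions}}; subtracting the query offset i aligns the postings, so a phrase match shows up as the same normalized position in every term's list. A small, made-up illustration of that data shape:

index = {
    "information": {1: {0, 7}, 2: {3}},
    "retrieval":   {1: {1},    2: {9}},
}
# For the phrase query "information retrieval", doc 1 matches because position 0
# of "information" aligns with position 1 - 1 = 0 of "retrieval"; doc 2 does not
# (3 vs 9 - 1 = 8).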
Example No. 3
def index():
    if request.method == 'POST':
        review = request.form['review']
        data = preprocess_sentence(review)
        aspect_extractor = AspectExtractor()
        bio, aspect_terms = aspect_extractor.extract_aspect(data, review)
        bio = convert_bio(bio)
        aspects = get_aspects(data, bio)
        aspect_map = {}
        for i in range(len(aspects)):
            aspect_map[aspect_terms[i]] = aspects[i]
        sentiment_food = predictData([review], "food")
        sentiment_price = predictData([review], "price")
        sentiment_place = predictData([review], "place")
        sentiment_service = predictData([review], "service")
        return render_template('index.html',
                               review=review,
                               bio=bio,
                               aspect_terms=aspect_terms,
                               aspects=aspect_map,
                               food=sentiment_food,
                               price=sentiment_price,
                               place=sentiment_place,
                               service=sentiment_service)
    else:
        return render_template('index.html')
def generate_actions(path):
    """
    Reads csv through csv.DictReader() and yields a single document for each record.
    This function is passed into the bulk() helper to create many documents in sequence.
    """
    uid = 0
    for _csv in tqdm(sorted(os.listdir(path))):
        file = os.path.join(path, _csv)
        with open(file, mode="r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                doc = {
                    "id": uid,
                    "document_name": _csv,
                    # "URL": row["\ufeffURL"],
                    # "MatchDateTime": row["MatchDateTime"],
                    "Station": row["Station"],
                    "Show": row["Show"],
                    # "IAShowID": row["IAShowID"],
                    # "IAPreviewThumb": row["IAPreviewThumb"],
                    "Snippet": " ".join(preprocess_sentence(row["Snippet"]))
                    if config_params["es_preprocess"] else row["Snippet"],
                }
                uid += 1
                yield doc
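Because generate_actions() yields one document per CSV row, it can be streamed straight into the Elasticsearch bulk helper, as its docstring describes. A minimal sketch, assuming a local cluster and a hypothetical index name and data path:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")          # assumed local instance
helpers.bulk(es, generate_actions("data/csv/"),      # hypothetical folder of CSV files
             index="news_snippets")                  # hypothetical index name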
Example No. 5
def main():
    tweetfile = "data/tweets/clean/clean.csv"
    # df = load_df(tweetfile)

    # minimal command-line argument handling
    args = sys.argv[1:]
    if len(args) < 2:
        print(
            "Please provide input in the format: <lookback value> <word1> <word2> ..."
        )
        print("Example: tweetgen2.py 2 my life")
        return
    n = int(args[0])
    initial_words = args[1:]

    mc = MarkovChain(lookback=n)
    mc.train(load_df(tweetfile)['text'].values.tolist())

    # initial_words = ['we', 'tend', 'to']
    # initial_words = ['life', 'is']
    tweet = mc.generate(initial_words)
    print("Generated tweet::\n{}".format(tweet))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(tweet)))
Example No. 6
def compare_scores(snippets):
    corpus = ''.join(snippets)
    corpus_list = corpus.split('.')
    scores_dict = dict()
    F1avg = 0
    valid_F1 = 0
    happ_avg_time = 0
    es_avg_time = 0
    for query in snippets:
        count = 0
        print("iteration number:", valid_F1)
        print(query)

        # preprocess the query
        if config_params["es_preprocess"]:
            query = " ".join(preprocess_sentence(query))
        if (len(query.split()) < 4):
            count += 1
            continue
        scores = metrics.metrics(query)

        precision = scores[0] / (scores[0] + scores[1] + 1e-9)
        recall = scores[0] / (scores[0] + scores[2] + 1e-9)
        F1 = 2 * precision * recall / (precision + recall + 1e-9)
        scores_dict[query] = [F1, precision, recall, scores[4], scores[5]]
        valid_F1 += 1
        happ_avg_time += scores[4]
        es_avg_time += scores[5]
        F1avg += F1
        print('scores:', scores)
        print('F1-score:', F1, 'precision:', precision, 'recall:', recall)
        print()
    print(F1avg / (valid_F1), happ_avg_time / valid_F1, es_avg_time / valid_F1)
    return scores_dict
Example No. 7
def detect_topic(inp):
	sentence = preprocess_sentence(inp)
	subject_freq = defaultdict(int)
	words = set(sentence.split())
	subjects_found = False
	for w in words:
		ignore = sum([1 if w.startswith(i) else 0 for i in ignore_words])
		if not ignore:
			try:
				print (w, word_subject_map[w])
				for k, v in word_subject_map[w].items():
					subject_freq[k] += v
				if not subjects_found:
					subjects_found = True
			except KeyError:
				pass
	if subjects_found:
		subject_scores = list()

		for k, v in subject_freq.items():
			subject_scores.append([k, v])

		subject_scores.sort(key=lambda x: x[1], reverse=True)
		print (subject_scores)
		up_to_index = 1
		threshold_score = remove_bottom * subject_scores[0][1]
		for i in range(1, len(subject_scores)):
			if subject_scores[i][1] >= threshold_score:
				up_to_index += 1
		return subject_scores[:up_to_index] 
	else:
		return ("Could not detect subject")
Example No. 8
def create_dataset(path, num_examples):
    lines = io.open(path, encoding="UTF-8").read().strip().split("\n")

    word_pairs = [
        [preprocess_sentence(w) for w in l.split("\t")] for l in lines[:num_examples]
    ]

    return zip(*word_pairs)
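Each line of the file is expected to hold a tab-separated sentence pair, so the returned zip unpacks directly into a source tuple and a target tuple. A short usage sketch (the file path and example count are hypothetical):

src_sentences, tgt_sentences = create_dataset("data/eng-vie.txt", 30000)
print(src_sentences[0])
print(tgt_sentences[0])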
Example No. 9
def inline(model, model_config, src_vocab, tgt_vocab, src_word_ind, tgt_word_ind, sentence):
    sentence = preprocess_sentence(sentence)
    src_tensor = [[int(src_word_ind[i]) if i in src_word_ind else 0 for i in sentence.split(' ')] + [0]*(8-len(sentence.split(' ')))]
    
    src_tensor = np.array(src_tensor)
    
    enc_hidden = tf.zeros((len(src_tensor), model_config['reccurent_hidden']))
    dec_input =  tf.expand_dims([int(tgt_word_ind['<start>'])] * len(src_tensor), 1)
    
    preds = model.predict([src_tensor, enc_hidden, dec_input])
    preds = np.array([preds[i].argmax(axis = 1) for i in range(len(preds))])
    preds = preds.swapaxes(0,1)
    
    print("Predicted Sentence is: ", *[tgt_vocab[str(i)] for i in preds[0]]) # This will not contain the first start token
Example No. 10
def main():
    tweetfile = "data/tweets/clean/clean.csv"
    df = load_df(tweetfile)
    text = "\n".join(df['text'].values.tolist()).strip()
    pairs = create_pairs(text)
    trie = build_trie(pairs)
    generated_words = generate1(trie,
                                initial_word='i',
                                max_len=15,
                                verbose=False)
    generated_text = ' '.join(generated_words)
    print("Generated tweet::\n{}".format(generated_text))
    print('-' * 30)
    print("After preprocessing <SENTENCE>::\n{}".format(
        preprocess_sentence(generated_text)))
    def query(self, query_string):
        """query the tfidf index  and return the list of matching doc IDs.

    :param query_string: A query string
    """
        #returns a sorted list of docids, with decreasing cosine similarity
        query_string = self.process_spell_errors(query_string)

        query_terms = preprocess_sentence(query_string)
        #query_term -> frequency of that term in the query
        query_frequencies = Counter(query_terms)

        #running dot product between the query tf-idf vector and each document tf-idf vector
        dotproducts = defaultdict(int)
        #running squared magnitude of each document tf-idf vector (square root taken later)
        magnitude = defaultdict(int)
        query_magnitude = 0

        #calculate the cosine similarity for the docs
        for term in query_frequencies.keys():

            if term not in self.index:
                continue

            query_tfidf = self.tfidf_score(query_frequencies[term],
                                           self.idf[term])
            query_magnitude += query_tfidf**2
            for doc in self.index[term]:
                doc_tfidf = self.tfidf_score(self.index[term][doc],
                                             self.idf[term])
                dotproducts[doc] += query_tfidf * doc_tfidf
                magnitude[doc] += doc_tfidf**2

        query_magnitude = sqrt(query_magnitude)
        cosine_similarity = {}
        for doc in magnitude:
            cosine_similarity[doc] = dotproducts[doc] / (
                query_magnitude * sqrt(magnitude[doc]) + 1e-10)

        ranked_docs = list(cosine_similarity.items())
        ranked_docs = sorted(ranked_docs, key=lambda x: x[1], reverse=True)
        threshold_docs = [
            doc for doc in ranked_docs
            if doc[1] > config_params["threshold_score"]
        ]
        #return docs with score > threshold, otherwise fall back to the top 10% of docs
        return threshold_docs if threshold_docs else ranked_docs[:len(ranked_docs) // 10 + 1]
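The ranking above is ordinary cosine similarity between tf-idf vectors, sim(q, d) = (q · d) / (|q| |d|), accumulated term by term so that only terms shared with the query contribute. A compact standalone sketch of the same computation on toy weights (not the class's own data):

import math

q = {"neural": 1.2, "network": 0.8}               # query term -> tf-idf weight
d = {"neural": 0.9, "network": 1.1, "deep": 0.5}  # document term -> tf-idf weight
dot = sum(q[t] * d[t] for t in q if t in d)
sim = dot / (math.sqrt(sum(v * v for v in q.values())) *
             math.sqrt(sum(v * v for v in d.values())) + 1e-10)
print(round(sim, 3))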
def evaluate(sentence):
    #This array stores the attention weights for plotting; it can be ignored
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    #Preprocess the sentence and convert it to padded word indices (Steps 2, 3 and 4)
    sentence = preprocess.preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    #Step 4
    inputs = tf.convert_to_tensor(inputs)

    #creating a string to store the translated sentence
    result = ''

    #Step 5
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden

    #Step 6
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    #Step 7
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        # store the attention weights to plot later on; this can be ignored
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        #Step 8
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    #Return the translated sentence, the preprocessed source sentence and the attention weight history
    return result, sentence, attention_plot
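A minimal caller for the decoding loop above, assuming the globals used by evaluate() (encoder, decoder, the language indexes and length limits) are already in scope; the sample input is only illustrative and depends on the language pair the model was trained on:

def translate(sentence):
    result, sentence, _ = evaluate(sentence)
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))

translate(u'hace mucho frio aqui .')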
Example No. 13
def load_data(path, src_vocab, src_vocab_len, sen_len, padding='post'):
    #build a word -> index mapping for the first src_vocab_len vocabulary entries
    word_ind = {}
    for i in src_vocab:
        if i != 'num_words' and int(i) < src_vocab_len:
            word_ind[src_vocab[i]] = i
    
    if path:
    
        lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
        cleaned = [preprocess_sentence(i) for i in lines]
        
        tokenizer = Tokenizer(oov_token='<OOV>')
        tokenizer.word_index = word_ind
        tensor = tokenizer.texts_to_sequences(cleaned)
        padded_tensor = pad_sequences(tensor, padding=padding, truncating='post', maxlen=sen_len)
    
        return padded_tensor, word_ind

    return None, word_ind
Example No. 14
def translate(x_test = x_test,y_test = y_test,target_vocab = trgt_vocab,source_vocab = src_vocab):
	#translate sentences in a loop until a keyboard interrupt is given
	print("Enter Sentences for translation : ")
	while(True):
		sentence = input("Source : ")
		#preprocessing the sentence
		sentence1 = pp.preprocess_sentence(sentence)
		#tokenizing the sentence
		sentence1 = pp.preprocess_corpus_translate(sentence1, tokenizers['en'])
		#converting it to tensor
		sentence_tensor = pp.tensors_from_pair_translate(source_vocab,sentence1[0],max_seq_length)
		sentence_tensor = torch.transpose(sentence_tensor, 1, 0)

		#initializing the model
		net = nmt.seq2seq(len(source_vocab),len(target_vocab),1024,1)
		net = net.cuda()
		net.load_state_dict(torch.load('./model/mdl_weights.pth'))

		#sending the sentence to cuda
		sentence_tensor = sentence_tensor.cuda()
		y = net(sentence_tensor)
		translation = ' '.join(target_vocab.unidex_words(y[1:-1]))
		print('Translation: "{}"\n'.format(translation))
Example No. 15
def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp,
             max_length_targ):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, encoder.enc_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot
Example No. 16
        targets = np.ndarray.flatten(
            np.array(np.argmax(outputs, axis=2) * mask))
        f_score_cal = calculate_f_score(predictions, targets)
        #print(loss.shape)
        #optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

        return loss, accuracy, f_score_cal


if __name__ == '__main__':
    input_file = os.path.join(
        '/Users/emielzyde/Desktop/Project/grammar_correction/lang8_preprocess.pickle'
    )
    with open(input_file, 'rb') as f:
        lang_data = pickle.load(f)
    new_data = [[preprocess.preprocess_sentence(w) for w in l.split('\t')]
                for l in lang_data[:NUM_DATA]]

    label_holder = []
    input_sentences = []
    for line in new_data:
        labels = postprocess.sentence_labeller(line[0], line[1])
        label_holder.append(labels)
        input_sentences.append(line[1])

    #label_holder = np.array(label_holder)
    #Pre-process the data
    data_holder = preprocess.Preprocessor(lang_data, NUM_DATA, 'TRAIN')
    _, target_dataset, _, output_table, _, max_length_tar, _, _, _, output_index2word, target_lengths = data_holder.finalise_dataset()
    def break_query(self, query_string):
        """break_query.
    A function to split a query based on wildcard operators

    :param query_string: A query string
    """
        star_flag = 0
        query_string = self.process_spell_errors(query_string)
        query_terms = preprocess_sentence(query_string)
        result_docs = set()
        new_query_terms = []

        for term in query_terms:
            if '*' in term:
                #prefix query
                star_flag = 1
                if term[-1] == '*':
                    term = term[:-1]
                    temp_terms = self.get_words_from_tree(self.tree, term)
                    result_docs = self.update_doclist(result_docs, temp_terms)

                elif term[0] == '*':
                    #suffix query
                    term = term[1:][::-1]
                    temp_terms = []
                    temp_terms = self.get_words_from_tree(
                        self.reverse_tree, term)
                    result_docs = self.update_doclist(result_docs, temp_terms)
                else:
                    #prefix+suffix query
                    pref_terms = []
                    suff_terms = []
                    star_index = term.index('*')
                    prefix_term = term[:star_index]
                    suffix_term = term[star_index + 1:]

                    pref_terms = self.get_words_from_tree(
                        self.tree, prefix_term)
                    suffix_term = suffix_term[::-1]
                    suff_terms = [
                        i[::-1] for i in self.get_words_from_tree(
                            self.reverse_tree, suffix_term)
                    ]
                    result_docs = self.update_doclist(
                        result_docs,
                        list(set(pref_terms).intersection(set(suff_terms))))

            else:
                new_query_terms.append(term)

        query_terms = new_query_terms
        query_terms.sort(key=lambda x: len(self.index[x]))

        #if it is a wild card query
        if star_flag == 1:
            if (len(query_terms) != 0):
                result_docs = set(
                    reduce(lambda x, y: x.intersection(y),
                           map(lambda x: self.index[x],
                               query_terms))).intersection(result_docs)
            return list(result_docs)

        if (len(query_terms) == 0):
            return list()
        return list(
            set(
                reduce(lambda x, y: x.intersection(y),
                       map(lambda x: self.index[x], query_terms))))
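To make the wildcard handling concrete: for a term such as "inf*ion" the code splits at '*', looks up the prefix "inf" in the forward trie and the reversed suffix "noi" in the reverse trie, then keeps only words returned by both lookups (the sets below are made up for illustration):

# pref_terms = {"information", "infection", "infusion"}   # from self.tree
# suff_terms = {"information", "infection", "question"}   # from self.reverse_tree, reversed back
# matched    = {"information", "infection"}               # intersection passed to update_doclist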
Example No. 18
#Loading all the data from the url
print("Loading Data................\n")
url_en = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en'
url_vi = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.vi'
text_en = pd.read_csv(url_en, sep='\n', header=None)
text_vi = pd.read_csv(url_vi, sep='\n', header=None)
data = pd.concat([text_en,text_vi],axis=1)
data.columns = ["source","target"]
#Due to some string errors, keep only rows that are non-null in both columns
data = data[data["source"].notnull()]
data = data[data["target"].notnull()]


#Pass each sentence through preprocess_sentence, which lowercases words and removes unwanted characters
data["source"] = data.source.apply(lambda w: pp.preprocess_sentence(w))
data["target"] = data.target.apply(lambda w: pp.preprocess_sentence(w))

#Keep only sentence pairs with at most max_len = 25 words on each side
# data = data[data["target"].str.split(" ").str.len() <= max_len]
# data = data[data["source"].str.split(" ").str.len() <= max_len]
data = data[(data['source'].str.split(" ").str.len() <= max_len) & (data['target'].str.split(" ").str.len() <= max_len)]
data = data.reset_index(drop=True)

#Loading  test Data
url_en_test = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.en'
url_vi_test = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.vi'
text_en_test = pd.read_csv(url_en_test, sep='\n', header=None)
text_vi_test = pd.read_csv(url_vi_test, sep='\n', header=None)

data_test = pd.concat([text_en_test,text_vi_test], axis = 1)