Example no. 1
0
def sent_rep(sent):
    # Represent a sentence as the average of its word-embedding expressions.
    tokens = w_tokenizer(sent)
    tokens_rep_list = []
    for token in tokens:
        tokens_rep_list.append(token_lookup(token))

    return dy.average(tokens_rep_list)
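A minimal, self-contained sketch of the same averaging idea, with a hypothetical toy vocabulary and embedding table standing in for w_tokenizer and token_lookup:

import dynet as dy

pc = dy.Model()
vocab = {"the": 0, "cat": 1, "sat": 2, "__UNK__": 3}   # hypothetical toy vocabulary
E = pc.add_lookup_parameters((len(vocab), 50))         # 50-dim word embeddings

def demo_sent_rep(sent):
    dy.renew_cg()
    ids = [vocab.get(w, vocab["__UNK__"]) for w in sent.split()]
    return dy.average([E[i] for i in ids])             # same averaging as sent_rep above

print(demo_sent_rep("the cat sat").npvalue().shape)    # (50,)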
Example no. 2
0
def token_rep(sent):
    # Return the list of word-embedding expressions, one per token in the sentence.
    tokens = w_tokenizer(sent)
    tokens_rep_list = []
    for token in tokens:
        tokens_rep_list.append(token_lookup(token))

    return tokens_rep_list
Example no. 3
	def remove_stop_words(self):
		"""
		Remove the Arabic stop words from the corpus files.
		Stop words are loaded from an input file into self.ar_stop_words.
		Requires module-level imports: os, errno, and bisect_left from bisect.
		"""
		def not_stop_word(word, lo=0):
			"""
			Search for the word in self.ar_stop_words (which must be
			sorted) using binary search to reduce lookup time.
			Returns the word's position in the stop-word list,
			or -1 if it is not a stop word.
			"""
			hi = len(self.ar_stop_words)
			pos = bisect_left(self.ar_stop_words, word, lo, hi)
			return pos if pos != hi and self.ar_stop_words[pos] == word else -1

		def initialize_dirs(folder):
			"""
			Initialize the reading and writing directories.
			If writing directory does not exist create it.
			"""
			reading_dir = os.path.join(self.raw_corpus_path, folder)
			writing_dir = os.path.join(self.processed_corpus_path, folder)
			if not os.path.exists(writing_dir):
				try:
					os.makedirs(writing_dir)
				except OSError as e:
					if e.errno != errno.EEXIST:
						raise
			return (reading_dir, writing_dir)

		# reading from reading_dir and writing to writing_dir
		# while eliminating stop_words
		for folder in os.listdir(self.raw_corpus_path):
			(reading_dir, writing_dir) = initialize_dirs(folder)
			for a_file in os.listdir(reading_dir):
				reading_file = os.path.join(reading_dir, a_file)
				writing_file = os.path.join(writing_dir, a_file)
				to_write = []
				with open(reading_file, 'r') as infile:
					lines = infile.read()
					words = w_tokenizer(lines)
					for word in words:
						if not_stop_word(word) == -1:
							to_write.append(word)

				with open(writing_file, 'w') as outfile:
					for word in to_write:
						# skip single characters
						if len(word) != 1:
							outfile.write(word + '\n')
			print(folder + " unstopped")
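A standalone sketch of the binary-search membership test used by not_stop_word above; the tiny stop-word list here is a hypothetical stand-in for self.ar_stop_words:

from bisect import bisect_left

stop_words = sorted(["and", "in", "of", "the"])   # must be sorted for bisect

def find_stop_word(word, lo=0):
    hi = len(stop_words)
    pos = bisect_left(stop_words, word, lo, hi)
    return pos if pos != hi and stop_words[pos] == word else -1

print(find_stop_word("the"))     # position in the sorted list (3)
print(find_stop_word("corpus"))  # -1: not a stop word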
Example no. 4
0
    
    return list(sents.keys())

data_reader = TrainingDataReader()
data_reader.read_paragraph("/home/slouvan/dynet/data/MCTest/mc160.train.tsv")
data_reader.read_answer("/home/slouvan/dynet/data/MCTest/mc160.train.ans")
t_instances = data_reader.construct_training_instances()

train_sentences = collect_sentences(t_instances)  # an item may contain a whole paragraph, so split it into sentences below
words = []
wc    = Counter()

for data in train_sentences:
    sents = s_tokenizer(data)
    for sent in sents:
        tokens = w_tokenizer(sent)
        for token in tokens:
            words.append(token)
            wc[token]+=1

words.append("__UNK__")
wc["__UNK__"]+=1
vw = Vocab.from_corpus([words])
nwords = vw.size()
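# Hedged aside (not part of the original script): "__UNK__" is added above so that
# tokens unseen during training can fall back to a known vocabulary symbol.
# A minimal stand-in for that fallback, using only the Counter built above:
def token_or_unk(token):
    # map tokens that never occurred in the training sentences to __UNK__
    return token if wc[token] > 0 else "__UNK__"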


# DyNet setup: parameter collection (model) and Adam trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)
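A minimal, hypothetical sketch (not from the source) of how this model/trainer pair is typically driven in DyNet: build a loss expression on a fresh computation graph, backpropagate, and update the parameters:

E = model.add_lookup_parameters((nwords, 64))   # hypothetical 64-dim embedding table
dy.renew_cg()
v = dy.average([E[0], E[1]])                    # toy expression over two embeddings
loss = dy.dot_product(v, v)                     # scalar toy loss
loss.forward()
loss.backward()
trainer.update()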
	def tokenize(self):
		self.query_tokens = w_tokenizer(self.query)