def sent_rep(sent):
    # Represent a sentence as the average of its token embeddings.
    tokens = w_tokenizer(sent)
    tokens_rep_list = []
    for token in tokens:
        tokens_rep_list.append(token_lookup(token))
    return dy.average(tokens_rep_list)
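# Hedged usage sketch (not in the original source): how sent_rep() might be
# called once a DyNet computation graph is active. The example sentence and
# the helper name _example_sent_rep_usage are illustrative only; w_tokenizer
# and token_lookup are assumed to be defined elsewhere in this code.
def _example_sent_rep_usage():
    dy.renew_cg()                                  # start a fresh computation graph
    vec = sent_rep("John went to the park .")      # average of the token embeddings
    return vec.npvalue()                           # numpy array of shape (embedding_dim,)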
def token_rep(sent):
    # Return the list of token embeddings for a sentence.
    tokens = w_tokenizer(sent)
    tokens_rep_list = []
    for token in tokens:
        tokens_rep_list.append(token_lookup(token))
    return tokens_rep_list
# Requires at module level: import os, import errno, from bisect import bisect_left.
def remove_stop_words(self):
    """
    Removes the Arabic stop words from the files.
    Stop words are loaded from an input file.
    """
    def not_stop_word(word, lo=0):
        """
        Searches for the word in self.ar_stop_words.
        Uses binary search to reduce search time.
        Return value: -1 if word is not a stop word,
        else its position in the stop-word list.
        """
        hi = len(self.ar_stop_words)
        pos = bisect_left(self.ar_stop_words, word, lo, hi)
        return (pos if pos != hi and self.ar_stop_words[pos] == word else -1)

    def initialize_dirs(folder):
        """
        Initialize the reading and writing directories.
        If the writing directory does not exist, create it.
        """
        reading_dir = os.path.join(self.raw_corpus_path, folder)
        writing_dir = os.path.join(self.processed_corpus_path, folder)
        if not os.path.exists(writing_dir):
            try:
                os.makedirs(writing_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        return (reading_dir, writing_dir)

    # Read from reading_dir and write to writing_dir,
    # eliminating stop words along the way.
    for folder in os.listdir(self.raw_corpus_path):
        (reading_dir, writing_dir) = initialize_dirs(folder)
        for a_file in os.listdir(reading_dir):
            reading_file = os.path.join(reading_dir, a_file)
            writing_file = os.path.join(writing_dir, a_file)
            to_write = []
            with open(reading_file, 'r') as infile:
                lines = infile.read()
                words = w_tokenizer(lines)
                for word in words:
                    if not_stop_word(word) == -1:
                        to_write.append(word)
            with open(writing_file, 'w') as outfile:
                for word in to_write:
                    # Also drop single-character tokens.
                    if len(word) != 1:
                        outfile.write(word + '\n')
        print(folder + " unstopped ")
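# Hedged sketch (not in the original source): not_stop_word() above searches
# self.ar_stop_words with bisect_left, which is only valid on a *sorted* list.
# A loader along these lines would satisfy that assumption; the method name
# load_stop_words and the default file name are hypothetical.
def load_stop_words(self, stop_words_file='arabic_stop_words.txt'):
    with open(stop_words_file, 'r') as infile:
        words = [line.strip() for line in infile if line.strip()]
    # De-duplicate and sort so that binary search is valid.
    self.ar_stop_words = sorted(set(words))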
    return list(sents.keys())


data_reader = TrainingDataReader()
data_reader.read_paragraph("/home/slouvan/dynet/data/MCTest/mc160.train.tsv")
data_reader.read_answer("/home/slouvan/dynet/data/MCTest/mc160.train.ans")
t_instances = data_reader.construct_training_instances()
train_sentences = collect_sentences(t_instances)

# Each entry may hold a whole paragraph, so split it into sentences first,
# then count word frequencies while building the token list.
words = []
wc = Counter()
for data in train_sentences:
    sents = s_tokenizer(data)
    for sent in sents:
        tokens = w_tokenizer(sent)
        for token in tokens:
            words.append(token)
            wc[token] += 1
words.append("__UNK__")
wc["__UNK__"] += 1

vw = Vocab.from_corpus([words])
nwords = vw.size()

# DyNet setup: model and optimizer.
model = dy.Model()
trainer = dy.AdamTrainer(model)
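# Hedged sketch (not in the original source): a plausible definition of the
# token_lookup() used by sent_rep()/token_rep() above, in case the real one is
# defined in a part of the file not shown here. EMB_DIM is a hypothetical
# hyper-parameter, and vw is assumed to expose a w2i dict mapping words to
# indices, as the Vocab class in the DyNet examples does.
EMB_DIM = 128
WORDS_LOOKUP = model.add_lookup_parameters((nwords, EMB_DIM))

def token_lookup(token):
    # Fall back to the __UNK__ index for out-of-vocabulary tokens.
    idx = vw.w2i.get(token, vw.w2i["__UNK__"])
    return dy.lookup(WORDS_LOOKUP, idx)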
def tokenize(self):
    self.query_tokens = w_tokenizer(self.query)