def __init__(self,docs,num_clu): self.no_clusters = num_clu print "Loading Sentences..." self.sentences = preprocessing.load_sentences(docs) print "Preprocessing..." self.sent_no_swords = preprocessing.remove_stopwords(self.sentences) self.unique_terms = helper.uniqueterms(self.sent_no_swords) self.sent_weight = helper.tfisf(self.sent_no_swords,self.unique_terms) #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms) print "Finding Similarity Graph..." self.sent_similarity = helper.similarity(self.sent_weight,self.sent_weight) print "Clustering..." self.clusters = cluster.kmedoid(self.sent_similarity,self.no_clusters) '''
def __init__(self, docs, num_clu): self.no_clusters = num_clu print "Loading Sentences..." self.sentences = preprocessing.load_sentences(docs) print "Preprocessing..." self.sent_no_swords = preprocessing.remove_stopwords(self.sentences) self.unique_terms = helper.uniqueterms(self.sent_no_swords) self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms) #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms) print "Finding Similarity Graph..." self.sent_similarity = helper.similarity(self.sent_weight, self.sent_weight) print "Clustering..." self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters) '''
def phrase_extraction(e_file, f_file, alignment):
    """Extract every consistent phrase pair from aligned parallel text.

    For each source sentence, every source span [start, end] is projected
    onto the target side via the word alignment, and ``extract`` decides
    which target spans form valid phrase pairs.

    :param e_file: source-side (e) sentence file.
    :param f_file: target-side (f) sentence file.
    :param alignment: word-alignment file, one alignment per sentence pair.
    :return: set of all extracted phrase pairs.
    """
    src_sents, tgt_sents = load_sentences(e_file, f_file)
    alignments = load_alignment(alignment)
    phrase_pairs = set()
    for idx, src_sent in enumerate(src_sents):
        tgt_sent = tgt_sents[idx]
        links = alignments[idx]
        n_src = len(src_sent)
        # Word positions are 1-based, hence the 1..n_src span bounds.
        for start in range(1, n_src + 1):
            for end in range(start, n_src + 1):
                # Project the source span onto the target side: the
                # tightest [lo, hi] covering all linked target words.
                lo, hi = len(tgt_sent), 0
                for pair in links:
                    e, f = int(pair[0]), int(pair[1])
                    if start <= e <= end:
                        if f < lo:
                            lo = f
                        if f > hi:
                            hi = f
                phrase_pairs.update(
                    extract(links, lo, hi, start, end, src_sent, tgt_sent))
    return phrase_pairs
# Copy remaining CLI options into the shared parameters dict.
# NOTE(review): `opts`, `parameters`, `models_path`, and `data_path` are
# defined earlier in the file (outside this chunk) -- confirm before reuse.
parameters['char_mode'] = opts.char_mode
parameters['reload'] = opts.reload == 1          # int flag -> bool
parameters['name'] = opts.name
parameters['epoches'] = opts.epoches             # key spelled 'epoches' throughout -- keep consistent
parameters['lr'] = opts.lr
parameters['momentum'] = opts.momentum
# GPU is used only when requested AND actually available.
parameters['use_gpu'] = opts.use_gpu == 1 and torch.cuda.is_available()
use_gpu = parameters['use_gpu']
lower = parameters['lower']
name = parameters['name']
# Checkpoint path derived from the run name.
model_name = models_path + name + ".pt"  # get_name(parameters)

# Load the raw corpora; `lower` controls case folding inside load_sentences.
train_sentences = load_sentences(data_path, lower, "train")
dev_sentences = load_sentences(data_path, lower, "dev")
test_train_sentences = load_sentences(data_path, lower, "train_test")

# Word vocabulary from the training data, then augmented with pretrained
# embeddings. When 'all_emb' is unset, only words seen in the dev set are
# pulled in from the embedding file; otherwise every pretrained word is kept.
dico_words_train = word_mapping(train_sentences, lower)[0]
dico_words, word_to_id, id_to_word = augment_with_pretrained(
    dico_words_train.copy(), parameters['pre_emb'],
    list(
        itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences]))
    if not parameters['all_emb'] else None)

# Character and tag vocabularies (tags come from the training split).
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(data_path, "train")