Example #1
	def __init__(self, docs, num_clu):
		self.no_clusters = num_clu
		print("Loading Sentences...")
		self.sentences = preprocessing.load_sentences(docs)
		print("Preprocessing...")
		self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
		self.unique_terms = helper.uniqueterms(self.sent_no_swords)
		# Weight each sentence by TF-ISF (term frequency * inverse sentence frequency)
		self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms)
		#self.sent_weight = helper.word_vector(self.sent_no_swords, self.unique_terms)
		print("Finding Similarity Graph...")
		# Pairwise similarity between all sentence weight vectors
		self.sent_similarity = helper.similarity(self.sent_weight, self.sent_weight)
		print("Clustering...")
		self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters)
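A minimal usage sketch, assuming the __init__ above belongs to a sentence-clustering class (called SentenceClusterer here purely for illustration) and that the project's preprocessing, helper, and cluster modules are importable:

# Hypothetical usage; SentenceClusterer is an assumed name for the class
# that owns the __init__ above, and the document list is a placeholder.
clusterer = SentenceClusterer(docs=["doc1.txt", "doc2.txt"], num_clu=5)
print(clusterer.clusters)  # cluster assignments produced by cluster.kmedoid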
Example #2
def phrase_extraction(e_file, f_file, alignment):
    e_list, f_list = load_sentences(e_file, f_file)
    A = load_alignment(alignment)
    BP = set()  # set of extracted bilingual phrase pairs

    for i in range(len(e_list)):
        # Enumerate every source-side span [e_start, e_end] of sentence i
        for e_start in range(1, len(e_list[i]) + 1):
            for e_end in range(e_start, len(e_list[i]) + 1):
                # Find the minimal target span covering all alignment
                # points whose source side falls inside [e_start, e_end]
                f_start, f_end = len(f_list[i]), 0
                for pair in A[i]:
                    e, f = int(pair[0]), int(pair[1])
                    if e_start <= e <= e_end:
                        f_start = min(f, f_start)
                        f_end = max(f, f_end)
                new_phrases = extract(A[i], f_start, f_end,
                                      e_start, e_end, e_list[i], f_list[i])
                BP.update(new_phrases)

    return BP
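A brief usage sketch, assuming a parallel corpus and a word-alignment file in whatever formats load_sentences and load_alignment expect (the file names below are placeholders, not part of the original code):

# Placeholder file names; the loaders' expected formats are project-specific.
phrases = phrase_extraction("corpus.en", "corpus.fr", "corpus.align")
print(len(phrases), "phrase pairs extracted")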
Example #3
parameters['char_mode'] = opts.char_mode
parameters['reload'] = opts.reload == 1
parameters['name'] = opts.name
parameters['epoches'] = opts.epoches
parameters['lr'] = opts.lr
parameters['momentum'] = opts.momentum

# Only use the GPU when it was requested and CUDA is actually available
parameters['use_gpu'] = opts.use_gpu == 1 and torch.cuda.is_available()
use_gpu = parameters['use_gpu']

lower = parameters['lower']

name = parameters['name']
model_name = models_path + name + ".pt"  # get_name(parameters)

train_sentences = load_sentences(data_path, lower, "train")
dev_sentences = load_sentences(data_path, lower, "dev")
test_train_sentences = load_sentences(data_path, lower, "train_test")

# Word vocabulary built from the training split only
dico_words_train = word_mapping(train_sentences, lower)[0]

# Extend the training vocabulary with pretrained embeddings; unless
# 'all_emb' is set, only pretrained words that occur in the dev split
# are added
dico_words, word_to_id, id_to_word = augment_with_pretrained(
    dico_words_train.copy(), parameters['pre_emb'],
    list(
        itertools.chain.from_iterable([[w[0] for w in s]
                                       for s in dev_sentences]))
    if not parameters['all_emb'] else None)

dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(data_path, "train")
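For context, a minimal sketch of how a mapping like word_to_id is typically applied when featurizing a sentence; this step is not shown in the example above, and the '<UNK>' key is an assumption about the vocabulary, not taken from the original project:

# Hypothetical featurization: map tokens to vocabulary ids, falling back
# to an assumed '<UNK>' unknown-word entry when a token is missing.
sentence = ["John", "lives", "in", "Berlin"]
tokens = [w.lower() if lower else w for w in sentence]
word_ids = [word_to_id.get(w, word_to_id.get('<UNK>', 0)) for w in tokens]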