def doc_word_embed0(path, no_add_set):
    """Embed a document as one GloVe vector per distinct in-vocabulary word.

    Words in no_add_set are skipped; each remaining word contributes at most once.
    """
    with open(path, 'r') as file:
        lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one list per input line
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        for w in sentence:
            w = w.text.lower()
            if w in embed_map and w not in added_set:
                added_set.add(w)
                words_ar.append(w)
                word_embeds.append(embed_map[w])
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    # disabled sanity check: word_embeds[:] = word_embeds[0]
    # optional L2 normalization:
    # word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
def doc_word_embed(path, no_add_set, content_lines=None):
    """Embed a document as averages of GloVe vectors over consecutive windows
    of n_avg in-vocabulary, non-stopword words.
    """
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens; flatten into a single token stream
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()
    tokens_l1 = []
    for sentence_l in tokens_l:
        tokens_l1.extend(sentence_l)
    tokens_l = [tokens_l1]
    n_avg = 5  # number of in-vocabulary words averaged into each embedding
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for j, w in enumerate(sentence):
            w = w.text.lower()
            if w in embed_map:
                if cur_counter == n_avg:
                    # flush the current window; the word that triggers the
                    # flush labels the window but is not accumulated into it
                    added_set.add(w)
                    words_ar.append(w)
                    word_embeds.append(cur_embed / n_avg)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0
                else:
                    cur_counter += 1
                    cur_embed += embed_map[w]
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    # disabled sanity check: word_embeds[:] = word_embeds.mean(0)
    return words_ar, word_embeds
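# A minimal usage sketch for the two word-level embedders above, assuming the
# module's GloVe data and device setup are available. The path
# 'data/sample_doc.txt' and the helper name _demo_word_embeds are hypothetical,
# not part of the original module.
def _demo_word_embeds(path='data/sample_doc.txt'):
    # doc_word_embed0: one GloVe vector per distinct in-vocabulary word.
    words0, embeds0 = doc_word_embed0(path, no_add_set=set())
    # doc_word_embed: one vector per window of n_avg (=5) in-vocabulary words.
    words, embeds = doc_word_embed(path, no_add_set=set())
    print(len(words0), tuple(embeds0.shape))  # (#unique words, 100)
    print(len(words), tuple(embeds.shape))    # (#windows, 100)
    return embeds0, embeds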
def doc_sentence_embed(path):
    """Embed each '.'-delimited sentence as the average of the GloVe vectors
    of its in-vocabulary words.
    """
    with open(path, 'r') as file:
        lines = file.readlines()
    lines1 = []
    for line in lines:
        lines1.extend(line.lower().split('.'))
    lines = lines1
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    tokens_l = tk.batch_tokenize(lines)
    word_embeds = []
    words_ar = []
    for sentence in tokens_l:
        if len(sentence) < 3:
            continue
        sentence_embed = 0
        found_embed = False
        for w in sentence:
            w = w.text.lower()
            if w in embed_map:
                sentence_embed += embed_map[w]
                found_embed = True
        if not found_embed:
            # skip sentences with no in-vocabulary word
            continue
        words_ar.append(sentence)
        # average over the full token count, including out-of-vocabulary tokens
        word_embeds.append(sentence_embed / len(sentence))
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    # optional L2 normalization:
    # word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()
    return words_ar, word_embeds
def doc_word_embed_sen(path, no_add_set, content_lines=None):
    """Embed a document clause by clause (split on punctuation), averaging the
    GloVe vectors of the in-vocabulary, non-stopword words in each clause.
    """
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()
    lines1 = []
    patt = re.compile(r'[;\.:!,?]')
    for line in lines:
        for cur_line in patt.split(line):
            lines1.append(cur_line)
    lines = lines1
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    stop_word_filter = StopwordFilter()
    # list of lists of tokens, one list per clause
    tokens_l = tk.batch_tokenize(lines)
    max_len = 200  # flush an intermediate embedding after this many in-vocabulary words
    word_embeds = []
    words_ar = []
    # no_add_set is currently unused here
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        if len(sentence) < 4:
            continue
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for j, w in enumerate(sentence):
            w = w.text.lower() if USE_ALLENNLP else w.lower()
            if w in embed_map:
                if cur_counter == max_len:
                    words_ar.append(w)
                    word_embeds.append(cur_embed / max_len)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0
                else:
                    cur_counter += 1
                    cur_embed += embed_map[w]
        # average whatever remains in the buffer over the clause length;
        # note words_ar only grows on intermediate flushes, so it can be
        # shorter than word_embeds
        word_embeds.append(cur_embed / len(sentence))
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    # disabled sanity check: word_embeds[:] = word_embeds.mean(0)
    return words_ar, word_embeds
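# A minimal usage sketch for the sentence/clause-level embedders above, again
# assuming a hypothetical input path; _demo_sentence_embeds is illustrative
# only and not part of the original module.
def _demo_sentence_embeds(path='data/sample_doc.txt'):
    # doc_sentence_embed: one averaged GloVe vector per '.'-delimited sentence.
    sents, sent_embeds = doc_sentence_embed(path)
    # doc_word_embed_sen: one averaged vector per punctuation-delimited clause,
    # plus intermediate flushes for clauses longer than max_len words.
    clauses, clause_embeds = doc_word_embed_sen(path, no_add_set=set())
    print(len(sents), tuple(sent_embeds.shape))   # (#sentences, 100)
    print(tuple(clause_embeds.shape))             # (#clause embeddings, 100)
    return sent_embeds, clause_embeds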