Example 1
import torch

import data       # project-local: GloVe loading
import tokenizer  # project-local: WordTokenizer
import utils      # project-local: torch device
# StopwordFilter is assumed to be importable from a project-local module


def doc_word_embed0(path, no_add_set):
    with open(path, 'r') as file:
        lines = file.readlines()

    vocab, embeds = data.process_glove_data(dim=100)  # load 100-d GloVe vectors
    embed_map = dict(zip(vocab, embeds))

    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one per line
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()

    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)  # words to skip; grows as embeddings are added
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        for w in sentence:
            w = w.text.lower()
            if w in embed_map and w not in added_set:
                added_set.add(w)
                words_ar.append(w)
                word_embeds.append(embed_map[w])

    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check (disabled): collapse every row onto the first word's vector
        word_embeds[:] = word_embeds[0]
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()  # optional L2 normalization
    return words_ar, word_embeds
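
A minimal usage sketch (the path 'corpus.txt' is hypothetical, and the project modules above must be importable):

words, embeds = doc_word_embed0('corpus.txt', no_add_set=set())
print(len(words), embeds.shape)  # one 100-d GloVe vector per unique in-vocabulary word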
Example 2
def doc_word_embed(path, no_add_set, content_lines=None):
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()

    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))

    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one per line
    tokens_l = tk.batch_tokenize(lines)
    stop_word_filter = StopwordFilter()

    # flatten all lines into a single token stream so the n_avg-word
    # averaging below runs across line boundaries
    tokens_l1 = []
    for sentence_l in tokens_l:
        tokens_l1.extend(sentence_l)
    tokens_l = [tokens_l1]

    n_avg = 5  # number of consecutive in-vocabulary words averaged per output vector
    word_embeds = []
    words_ar = []
    added_set = set(no_add_set)  # populated below but no longer consulted for filtering
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for w in sentence:
            w = w.text.lower()
            if w in embed_map:
                cur_counter += 1
                cur_embed += embed_map[w]
                if cur_counter == n_avg:
                    # emit one averaged embedding per n_avg in-vocabulary words
                    added_set.add(w)
                    words_ar.append(w)
                    word_embeds.append(cur_embed / n_avg)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0
        # a trailing chunk of fewer than n_avg words is discarded

    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check (disabled): replace every row with the mean embedding
        word_embeds[:] = word_embeds.mean(0)
    return words_ar, word_embeds
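
Unlike doc_word_embed0, this variant emits one vector per run of n_avg consecutive in-vocabulary words rather than one per word. A sketch of the expected call, with 'corpus.txt' again hypothetical:

chunks, embeds = doc_word_embed('corpus.txt', no_add_set=set())
# chunks[i] is the last word of the i-th 5-word chunk;
# embeds[i] is that chunk's mean GloVe vector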
Example 3
def doc_sentence_embed(path):
    with open(path, 'r') as file:
        lines = file.readlines()

    # crude sentence segmentation: lowercase and split on periods
    lines1 = []
    for line in lines:
        lines1.extend(line.lower().split('.'))

    lines = lines1
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))

    tk = tokenizer.WordTokenizer()
    tokens_l = tk.batch_tokenize(lines)
    word_embeds = []
    words_ar = []
    for sentence in tokens_l:
        if len(sentence) < 3:  # skip very short fragments
            continue
        sentence_embed = 0
        found_embed = False
        for w in sentence:
            w = w.text.lower()
            if w in embed_map:
                sentence_embed += embed_map[w]
                found_embed = True
        if not found_embed:  # no in-vocabulary words in this sentence
            continue
        words_ar.append(sentence)
        # average over the full sentence length, including out-of-vocabulary tokens
        word_embeds.append(sentence_embed / len(sentence))

    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    #word_embeds = word_embeds / (word_embeds**2).sum(dim=1, keepdim=True).sqrt()  # optional L2 normalization
    return words_ar, word_embeds
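
A sketch of the expected output, under the same assumptions about the project modules ('corpus.txt' is hypothetical):

sentences, embeds = doc_sentence_embed('corpus.txt')
# sentences[i] is the token list of the i-th kept sentence;
# embeds[i] is its mean 100-d GloVe vector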
Example 4
# assumes module-level: import re, plus a USE_ALLENNLP flag selecting
# between AllenNLP Token objects and plain strings
def doc_word_embed_sen(path, no_add_set, content_lines=None):
    if content_lines is not None:
        lines = content_lines
    else:
        with open(path, 'r') as file:
            lines = file.readlines()
            
    lines1 = []
    # split each line on punctuation to get rough sentence segments
    patt = re.compile(r'[;.:!,?]')
    for line in lines:
        lines1.extend(patt.split(line))
    lines = lines1
    
    vocab, embeds = data.process_glove_data(dim=100)
    embed_map = dict(zip(vocab, embeds))
    tk = tokenizer.WordTokenizer()
    # list of lists of tokens, one per segment
    tokens_l = tk.batch_tokenize(lines)

    max_len = 200  # cap on the number of in-vocabulary words averaged per chunk
    stop_word_filter = StopwordFilter()
    word_embeds = []
    words_ar = []
    # note: no_add_set is accepted but unused in this variant
    for sentence in tokens_l:
        sentence = stop_word_filter.filter_words(sentence)
        if len(sentence) < 4:  # skip very short segments
            continue
        cur_embed = torch.zeros_like(embed_map['a'])
        cur_counter = 0
        for w in sentence:
            w = w.text.lower() if USE_ALLENNLP else w.lower()
            if w in embed_map:
                cur_counter += 1
                cur_embed += embed_map[w]
                if cur_counter == max_len:
                    # emit one chunk embedding per max_len in-vocabulary words
                    words_ar.append(w)
                    word_embeds.append(cur_embed / max_len)
                    cur_embed = torch.zeros_like(embed_map['a'])
                    cur_counter = 0

        # leftover words (the whole segment when shorter than max_len) are
        # averaged over the full segment length; append the token list so
        # words_ar stays aligned with word_embeds
        words_ar.append(sentence)
        word_embeds.append(cur_embed / len(sentence))
        
    word_embeds = torch.stack(word_embeds, dim=0).to(utils.device)
    if False:  # sanity check (disabled): replace every row with the mean embedding
        word_embeds[:] = word_embeds.mean(0)
    
    return words_ar, word_embeds
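
A sketch of the expected call ('corpus.txt' is hypothetical; import re and the USE_ALLENNLP flag must exist at module level):

segments, embeds = doc_word_embed_sen('corpus.txt', no_add_set=set())
# one averaged vector per punctuation-delimited segment, plus one per
# full max_len chunk inside unusually long segments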